radeonsi: move VS shader code into si_shader_llvm_vs.c
[mesa.git] src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
/*
 * Copyright 2017 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "si_shader_internal.h"

#include "sid.h"

#include "util/u_memory.h"
#include "util/u_prim.h"
#include "ac_llvm_cull.h"
static LLVMValueRef get_wave_id_in_tg(struct si_shader_context *ctx)
{
   return si_unpack_param(ctx, ctx->merged_wave_info, 24, 4);
}

static LLVMValueRef get_tgsize(struct si_shader_context *ctx)
{
   return si_unpack_param(ctx, ctx->merged_wave_info, 28, 4);
}

static LLVMValueRef get_thread_id_in_tg(struct si_shader_context *ctx)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef tmp;
   tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
                      LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), "");
   return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
}
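/* merged_wave_info layout, as implied by the unpack calls above:
 *   bits [24:28) = wave index within the threadgroup
 *   bits [28:32) = number of waves in the threadgroup
 * The threadgroup-relative thread ID is wave_id * wave_size + lane_id.
 */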

static LLVMValueRef ngg_get_vtx_cnt(struct si_shader_context *ctx)
{
   return si_unpack_param(ctx, ctx->gs_tg_info, 12, 9);
}

static LLVMValueRef ngg_get_prim_cnt(struct si_shader_context *ctx)
{
   return si_unpack_param(ctx, ctx->gs_tg_info, 22, 9);
}

static LLVMValueRef ngg_get_ordered_id(struct si_shader_context *ctx)
{
   return si_unpack_param(ctx, ctx->gs_tg_info, 0, 12);
}
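/* gs_tg_info layout, as implied by the unpack calls above:
 *   bits [0:12)  = ordered ID (used below as the GDS ordered-count index
 *                  for streamout)
 *   bits [12:21) = number of vertices in the threadgroup
 *   bits [22:31) = number of primitives in the threadgroup
 */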

static LLVMValueRef ngg_get_query_buf(struct si_shader_context *ctx)
{
   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);

   return ac_build_load_to_sgpr(&ctx->ac, buf_ptr,
                                LLVMConstInt(ctx->ac.i32, GFX10_GS_QUERY_BUF, false));
}

static LLVMValueRef ngg_get_initial_edgeflag(struct si_shader_context *ctx, unsigned index)
{
   if (ctx->type == PIPE_SHADER_VERTEX) {
      LLVMValueRef tmp;
      tmp = LLVMBuildLShr(ctx->ac.builder,
                          ac_get_arg(&ctx->ac, ctx->args.gs_invocation_id),
                          LLVMConstInt(ctx->ac.i32, 8 + index, false), "");
      return LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i1, "");
   }
   return ctx->ac.i1false;
}
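/* For NGG VS, the initial edge flags of the three primitive vertices arrive
 * packed into the gs_invocation_id input VGPR at bits 8, 9 and 10, which is
 * what the shift by (8 + index) above extracts.
 */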

/**
 * Return the number of vertices as a constant in \p num_vertices,
 * and return a more precise value as LLVMValueRef from the function.
 */
static LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx,
                                              unsigned *num_vertices)
{
   const struct si_shader_info *info = &ctx->shader->selector->info;

   if (ctx->type == PIPE_SHADER_VERTEX) {
      if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
         /* Blits always use axis-aligned rectangles with 3 vertices. */
         *num_vertices = 3;
         return LLVMConstInt(ctx->ac.i32, 3, 0);
      } else {
         /* We always build up all three indices for the prim export
          * independent of the primitive type. The additional garbage
          * data shouldn't hurt. This number doesn't matter with
          * NGG passthrough.
          */
         *num_vertices = 3;

         /* Extract OUTPRIM field. */
         LLVMValueRef num = si_unpack_param(ctx, ctx->vs_state_bits, 2, 2);
         return LLVMBuildAdd(ctx->ac.builder, num, ctx->ac.i32_1, "");
      }
   } else {
      assert(ctx->type == PIPE_SHADER_TESS_EVAL);

      if (info->properties[TGSI_PROPERTY_TES_POINT_MODE])
         *num_vertices = 1;
      else if (info->properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES)
         *num_vertices = 2;
      else
         *num_vertices = 3;

      return LLVMConstInt(ctx->ac.i32, *num_vertices, false);
   }
}
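/* The OUTPRIM field in vs_state_bits (bits [2:4)) stores the number of
 * vertices per primitive minus one, hence the +1 above: 0 -> points,
 * 1 -> lines, 2 -> triangles.
 */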

bool gfx10_ngg_export_prim_early(struct si_shader *shader)
{
   struct si_shader_selector *sel = shader->selector;

   assert(shader->key.as_ngg && !shader->key.as_es);

   return sel->type != PIPE_SHADER_GEOMETRY &&
          !sel->info.writes_edgeflag;
}

void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx)
{
   ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx),
                                 ngg_get_vtx_cnt(ctx),
                                 ngg_get_prim_cnt(ctx));
}

void gfx10_ngg_build_export_prim(struct si_shader_context *ctx,
                                 LLVMValueRef user_edgeflags[3],
                                 LLVMValueRef prim_passthrough)
{
   LLVMBuilderRef builder = ctx->ac.builder;

   if (gfx10_is_ngg_passthrough(ctx->shader) ||
       ctx->shader->key.opt.ngg_culling) {
      ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
      {
         struct ac_ngg_prim prim = {};

         if (prim_passthrough)
            prim.passthrough = prim_passthrough;
         else
            prim.passthrough = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);

         /* This is only used with NGG culling, which returns the NGG
          * passthrough prim export encoding.
          */
         if (ctx->shader->selector->info.writes_edgeflag) {
            unsigned all_bits_no_edgeflags = ~SI_NGG_PRIM_EDGE_FLAG_BITS;
            LLVMValueRef edgeflags = LLVMConstInt(ctx->ac.i32, all_bits_no_edgeflags, 0);

            unsigned num_vertices;
            ngg_get_vertices_per_prim(ctx, &num_vertices);

            for (unsigned i = 0; i < num_vertices; i++) {
               unsigned shift = 9 + i*10;
               LLVMValueRef edge;

               edge = LLVMBuildLoad(builder, user_edgeflags[i], "");
               edge = LLVMBuildZExt(builder, edge, ctx->ac.i32, "");
               edge = LLVMBuildShl(builder, edge, LLVMConstInt(ctx->ac.i32, shift, 0), "");
               edgeflags = LLVMBuildOr(builder, edgeflags, edge, "");
            }
            prim.passthrough = LLVMBuildAnd(builder, prim.passthrough, edgeflags, "");
         }

         ac_build_export_prim(&ctx->ac, &prim);
      }
      ac_build_endif(&ctx->ac, 6001);
      return;
   }

   ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
   {
      struct ac_ngg_prim prim = {};

      ngg_get_vertices_per_prim(ctx, &prim.num_vertices);

      prim.isnull = ctx->ac.i1false;
      prim.index[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
      prim.index[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
      prim.index[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);

      for (unsigned i = 0; i < prim.num_vertices; ++i) {
         prim.edgeflag[i] = ngg_get_initial_edgeflag(ctx, i);

         if (ctx->shader->selector->info.writes_edgeflag) {
            LLVMValueRef edge;

            edge = LLVMBuildLoad(ctx->ac.builder, user_edgeflags[i], "");
            edge = LLVMBuildAnd(ctx->ac.builder, prim.edgeflag[i], edge, "");
            prim.edgeflag[i] = edge;
         }
      }

      ac_build_export_prim(&ctx->ac, &prim);
   }
   ac_build_endif(&ctx->ac, 6001);
}

static void build_streamout_vertex(struct si_shader_context *ctx,
                                   LLVMValueRef *so_buffer, LLVMValueRef *wg_offset_dw,
                                   unsigned stream, LLVMValueRef offset_vtx,
                                   LLVMValueRef vertexptr)
{
   struct si_shader_info *info = &ctx->shader->selector->info;
   struct pipe_stream_output_info *so = &ctx->shader->selector->so;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef offset[4] = {};
   LLVMValueRef tmp;

   for (unsigned buffer = 0; buffer < 4; ++buffer) {
      if (!wg_offset_dw[buffer])
         continue;

      tmp = LLVMBuildMul(builder, offset_vtx,
                         LLVMConstInt(ctx->ac.i32, so->stride[buffer], false), "");
      tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, "");
      offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), "");
   }
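   /* offset[buffer] is a byte offset: (wg_offset_dw + offset_vtx * stride_dw)
    * dwords, converted to bytes by the shift-left by 2 above.
    */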

   for (unsigned i = 0; i < so->num_outputs; ++i) {
      if (so->output[i].stream != stream)
         continue;

      unsigned reg = so->output[i].register_index;
      struct si_shader_output_values out;
      out.semantic_name = info->output_semantic_name[reg];
      out.semantic_index = info->output_semantic_index[reg];

      for (unsigned comp = 0; comp < 4; comp++) {
         tmp = ac_build_gep0(&ctx->ac, vertexptr,
                             LLVMConstInt(ctx->ac.i32, 4 * reg + comp, false));
         out.values[comp] = LLVMBuildLoad(builder, tmp, "");
         out.vertex_stream[comp] =
            (info->output_streams[reg] >> (2 * comp)) & 3;
      }

      si_llvm_streamout_store_output(ctx, so_buffer, offset, &so->output[i], &out);
   }
}

struct ngg_streamout {
   LLVMValueRef num_vertices;

   /* per-thread data */
   LLVMValueRef prim_enable[4]; /* i1 per stream */
   LLVMValueRef vertices[3]; /* [N x i32] addrspace(LDS)* */

   /* Output */
   LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */
};

/**
 * Build streamout logic.
 *
 * Implies a barrier.
 *
 * Writes number of emitted primitives to gs_ngg_scratch[4:8].
 *
 * Clobbers gs_ngg_scratch[8:].
 */
static void build_streamout(struct si_shader_context *ctx,
                            struct ngg_streamout *nggso)
{
   struct si_shader_info *info = &ctx->shader->selector->info;
   struct pipe_stream_output_info *so = &ctx->shader->selector->so;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
   LLVMValueRef tid = get_thread_id_in_tg(ctx);
   LLVMValueRef tmp, tmp2;
   LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false);
   LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false);
   LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false);
   LLVMValueRef so_buffer[4] = {};
   unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) +
                               (nggso->vertices[2] ? 1 : 0);
   LLVMValueRef prim_stride_dw[4] = {};
   LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32);
   int stream_for_buffer[4] = { -1, -1, -1, -1 };
   unsigned bufmask_for_stream[4] = {};
   bool isgs = ctx->type == PIPE_SHADER_GEOMETRY;
   unsigned scratch_emit_base = isgs ? 4 : 0;
   LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->ac.i32_0;
   unsigned scratch_offset_base = isgs ? 8 : 4;
   LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4;

   ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256);

   /* Determine the mapping of streamout buffers to vertex streams. */
   for (unsigned i = 0; i < so->num_outputs; ++i) {
      unsigned buf = so->output[i].output_buffer;
      unsigned stream = so->output[i].stream;
      assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream);
      stream_for_buffer[buf] = stream;
      bufmask_for_stream[stream] |= 1 << buf;
   }

   for (unsigned buffer = 0; buffer < 4; ++buffer) {
      if (stream_for_buffer[buffer] == -1)
         continue;

      assert(so->stride[buffer]);

      tmp = LLVMConstInt(ctx->ac.i32, so->stride[buffer], false);
      prim_stride_dw[buffer] = LLVMBuildMul(builder, tmp, nggso->num_vertices, "");
      prim_stride_dw_vgpr = ac_build_writelane(
         &ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer],
         LLVMConstInt(ctx->ac.i32, buffer, false));

      so_buffer[buffer] = ac_build_load_to_sgpr(
         &ctx->ac, buf_ptr,
         LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + buffer, false));
   }

   tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
   ac_build_ifcc(&ctx->ac, tmp, 5200);
   {
      LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
      LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");

      /* Advance the streamout offsets in GDS. */
      LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
      LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");

      tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
      ac_build_ifcc(&ctx->ac, tmp, 5210);
      {
         if (isgs) {
            tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid);
            tmp = LLVMBuildLoad(builder, tmp, "");
         } else {
            tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0,
                                     ngg_get_prim_cnt(ctx), ctx->ac.i32_0);
         }
         LLVMBuildStore(builder, tmp, generated_by_stream_vgpr);

         unsigned swizzle[4];
         int unused_stream = -1;
         for (unsigned stream = 0; stream < 4; ++stream) {
            if (!info->num_stream_output_components[stream]) {
               unused_stream = stream;
               break;
            }
         }
         for (unsigned buffer = 0; buffer < 4; ++buffer) {
            if (stream_for_buffer[buffer] >= 0) {
               swizzle[buffer] = stream_for_buffer[buffer];
            } else {
               assert(unused_stream >= 0);
               swizzle[buffer] = unused_stream;
            }
         }

         tmp = ac_build_quad_swizzle(&ctx->ac, tmp,
                                     swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
         tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");

         LLVMValueRef args[] = {
            LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""),
            tmp,
            ctx->ac.i32_0, // ordering
            ctx->ac.i32_0, // scope
            ctx->ac.i1false, // isVolatile
            LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index
            ctx->ac.i1true, // wave release
            ctx->ac.i1true, // wave done
         };
         tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add",
                                  ctx->ac.i32, args, ARRAY_SIZE(args), 0);
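         /* The ordered add returns the pre-add value of the GDS counter,
          * i.e. the running dword offset where this threadgroup's streamout
          * data begins, while adding this group's contribution in primitive
          * order across threadgroups (keyed by the ordered ID above).
          */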

         /* Keep offsets in a VGPR for quick retrieval via readlane by
          * the first wave for bounds checking, and also store in LDS
          * for retrieval by all waves later. */
         LLVMBuildStore(builder, tmp, offsets_vgpr);

         tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac),
                             scratch_offset_basev, "");
         tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2);
         LLVMBuildStore(builder, tmp, tmp2);
      }
      ac_build_endif(&ctx->ac, 5210);

      /* Determine the max emit per buffer. This is done via the SALU, in part
       * because LLVM can't generate divide-by-multiply if we try to do this
       * via VALU with one lane per buffer.
       */
      LLVMValueRef max_emit[4] = {};
      for (unsigned buffer = 0; buffer < 4; ++buffer) {
         if (stream_for_buffer[buffer] == -1)
            continue;

         LLVMValueRef bufsize_dw =
            LLVMBuildLShr(builder,
                          LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""),
                          i32_2, "");

         tmp = LLVMBuildLoad(builder, offsets_vgpr, "");
         LLVMValueRef offset_dw =
            ac_build_readlane(&ctx->ac, tmp,
                              LLVMConstInt(ctx->ac.i32, buffer, false));

         tmp = LLVMBuildSub(builder, bufsize_dw, offset_dw, "");
         tmp = LLVMBuildUDiv(builder, tmp, prim_stride_dw[buffer], "");

         tmp2 = LLVMBuildICmp(builder, LLVMIntULT, bufsize_dw, offset_dw, "");
         max_emit[buffer] = LLVMBuildSelect(builder, tmp2, ctx->ac.i32_0, tmp, "");
      }
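      /* In other words:
       *   max_emit = (bufsize_dw - offset_dw) / prim_stride_dw,
       * clamped to 0 when the offset has already overflowed the buffer.
       * bufsize_dw comes from dword 2 of the buffer descriptor, shifted
       * right by 2 to convert bytes to dwords.
       */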

      /* Determine the number of emitted primitives per stream and fixup the
       * GDS counter if necessary.
       *
       * This is complicated by the fact that a single stream can emit to
       * multiple buffers (but luckily not vice versa).
       */
      LLVMValueRef emit_vgpr = ctx->ac.i32_0;

      for (unsigned stream = 0; stream < 4; ++stream) {
         if (!info->num_stream_output_components[stream])
            continue;

         tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, "");
         LLVMValueRef generated =
            ac_build_readlane(&ctx->ac, tmp,
                              LLVMConstInt(ctx->ac.i32, stream, false));

         LLVMValueRef emit = generated;
         for (unsigned buffer = 0; buffer < 4; ++buffer) {
            if (stream_for_buffer[buffer] == stream)
               emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]);
         }

         emit_vgpr = ac_build_writelane(&ctx->ac, emit_vgpr, emit,
                                        LLVMConstInt(ctx->ac.i32, stream, false));

         /* Fixup the offset using a plain GDS atomic if we overflowed. */
         tmp = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, "");
         ac_build_ifcc(&ctx->ac, tmp, 5221); /* scalar branch */
         tmp = LLVMBuildLShr(builder,
                             LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false),
                             ac_get_thread_id(&ctx->ac), "");
         tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
         ac_build_ifcc(&ctx->ac, tmp, 5222);
         {
            tmp = LLVMBuildSub(builder, generated, emit, "");
            tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
            tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, "");
            LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp,
                               LLVMAtomicOrderingMonotonic, false);
         }
         ac_build_endif(&ctx->ac, 5222);
         ac_build_endif(&ctx->ac, 5221);
      }

      tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
      ac_build_ifcc(&ctx->ac, tmp, 5225);
      {
         tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac),
                            scratch_emit_basev, "");
         tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp);
         LLVMBuildStore(builder, emit_vgpr, tmp);
      }
      ac_build_endif(&ctx->ac, 5225);
   }
   ac_build_endif(&ctx->ac, 5200);

   /* Determine the workgroup-relative per-thread / primitive offset into
    * the streamout buffers */
   struct ac_wg_scan primemit_scan[4] = {};

   if (isgs) {
      for (unsigned stream = 0; stream < 4; ++stream) {
         if (!info->num_stream_output_components[stream])
            continue;

         primemit_scan[stream].enable_exclusive = true;
         primemit_scan[stream].op = nir_op_iadd;
         primemit_scan[stream].src = nggso->prim_enable[stream];
         primemit_scan[stream].scratch =
            ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch,
                          LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false));
         primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx);
         primemit_scan[stream].numwaves = get_tgsize(ctx);
         primemit_scan[stream].maxwaves = 8;
         ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]);
      }
   }

   ac_build_s_barrier(&ctx->ac);

   /* Fetch the per-buffer offsets and per-stream emit counts in all waves. */
   LLVMValueRef wgoffset_dw[4] = {};

   {
      LLVMValueRef scratch_vgpr;

      tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac));
      scratch_vgpr = LLVMBuildLoad(builder, tmp, "");

      for (unsigned buffer = 0; buffer < 4; ++buffer) {
         if (stream_for_buffer[buffer] >= 0) {
            wgoffset_dw[buffer] = ac_build_readlane(
               &ctx->ac, scratch_vgpr,
               LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false));
         }
      }

      for (unsigned stream = 0; stream < 4; ++stream) {
         if (info->num_stream_output_components[stream]) {
            nggso->emit[stream] = ac_build_readlane(
               &ctx->ac, scratch_vgpr,
               LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false));
         }
      }
   }

   /* Write out primitive data */
   for (unsigned stream = 0; stream < 4; ++stream) {
      if (!info->num_stream_output_components[stream])
         continue;

      if (isgs) {
         ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]);
      } else {
         primemit_scan[stream].result_exclusive = tid;
      }

      tmp = LLVMBuildICmp(builder, LLVMIntULT,
                          primemit_scan[stream].result_exclusive,
                          nggso->emit[stream], "");
      tmp = LLVMBuildAnd(builder, tmp, nggso->prim_enable[stream], "");
      ac_build_ifcc(&ctx->ac, tmp, 5240);
      {
         LLVMValueRef offset_vtx =
            LLVMBuildMul(builder, primemit_scan[stream].result_exclusive,
                         nggso->num_vertices, "");

         for (unsigned i = 0; i < max_num_vertices; ++i) {
            tmp = LLVMBuildICmp(builder, LLVMIntULT,
                                LLVMConstInt(ctx->ac.i32, i, false),
                                nggso->num_vertices, "");
            ac_build_ifcc(&ctx->ac, tmp, 5241);
            build_streamout_vertex(ctx, so_buffer, wgoffset_dw,
                                   stream, offset_vtx, nggso->vertices[i]);
            ac_build_endif(&ctx->ac, 5241);
            offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, "");
         }
      }
      ac_build_endif(&ctx->ac, 5240);
   }
}

/* LDS layout of ES vertex data for NGG culling. */
enum {
   /* Byte 0: Boolean ES thread accepted (unculled) flag, and later the old
    *         ES thread ID. After vertex compaction, compacted ES threads
    *         store the old thread ID here to copy input VGPRs from uncompacted
    *         ES threads.
    * Byte 1: New ES thread ID, loaded by GS to prepare the prim export value.
    * Byte 2: TES rel patch ID
    * Byte 3: Unused
    */
   lds_byte0_accept_flag = 0,
   lds_byte0_old_thread_id = 0,
   lds_byte1_new_thread_id,
   lds_byte2_tes_rel_patch_id,
   lds_byte3_unused,

   lds_packed_data = 0, /* lds_byteN_... */

   lds_pos_x,
   lds_pos_y,
   lds_pos_z,
   lds_pos_w,
   lds_pos_x_div_w,
   lds_pos_y_div_w,
   /* If VS: */
   lds_vertex_id,
   lds_instance_id, /* optional */
   /* If TES: */
   lds_tes_u = lds_vertex_id,
   lds_tes_v = lds_instance_id,
   lds_tes_patch_id, /* optional */
};
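/* For example, for a VS with culling the per-vertex LDS slot is 9 dwords:
 * packed_data, pos.xyzw, pos.x/w, pos.y/w, vertex_id, instance_id, which is
 * what STATIC_ASSERT(lds_instance_id + 1 == 9) in ngg_nogs_vertex_size
 * checks below.
 */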

static LLVMValueRef si_build_gep_i8(struct si_shader_context *ctx,
                                    LLVMValueRef ptr, unsigned byte_index)
{
   assert(byte_index < 4);
   LLVMTypeRef pi8 = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS);
   LLVMValueRef index = LLVMConstInt(ctx->ac.i32, byte_index, 0);

   return LLVMBuildGEP(ctx->ac.builder,
                       LLVMBuildPointerCast(ctx->ac.builder, ptr, pi8, ""),
                       &index, 1, "");
}

static unsigned ngg_nogs_vertex_size(struct si_shader *shader)
{
   unsigned lds_vertex_size = 0;

   /* The edgeflag is always stored in the last element that's also
    * used for padding to reduce LDS bank conflicts. */
   if (shader->selector->so.num_outputs)
      lds_vertex_size = 4 * shader->selector->info.num_outputs + 1;
   if (shader->selector->info.writes_edgeflag)
      lds_vertex_size = MAX2(lds_vertex_size, 1);

   /* LDS size for passing data from GS to ES.
    * GS stores Primitive IDs into LDS at the address corresponding
    * to the ES thread of the provoking vertex. All ES threads
    * load and export PrimitiveID for their thread.
    */
   if (shader->selector->type == PIPE_SHADER_VERTEX &&
       shader->key.mono.u.vs_export_prim_id)
      lds_vertex_size = MAX2(lds_vertex_size, 1);

   if (shader->key.opt.ngg_culling) {
      if (shader->selector->type == PIPE_SHADER_VERTEX) {
         STATIC_ASSERT(lds_instance_id + 1 == 9);
         lds_vertex_size = MAX2(lds_vertex_size, 9);
      } else {
         assert(shader->selector->type == PIPE_SHADER_TESS_EVAL);

         if (shader->selector->info.uses_primid ||
             shader->key.mono.u.vs_export_prim_id) {
            STATIC_ASSERT(lds_tes_patch_id + 2 == 11);
            lds_vertex_size = MAX2(lds_vertex_size, 11);
         } else {
            STATIC_ASSERT(lds_tes_v + 1 == 9);
            lds_vertex_size = MAX2(lds_vertex_size, 9);
         }
      }
   }

   return lds_vertex_size;
}

/**
 * Returns an `[N x i32] addrspace(LDS)*` pointing at contiguous LDS storage
 * for the vertex outputs.
 */
static LLVMValueRef ngg_nogs_vertex_ptr(struct si_shader_context *ctx,
                                        LLVMValueRef vtxid)
{
   /* The extra dword is used to avoid LDS bank conflicts. */
   unsigned vertex_size = ngg_nogs_vertex_size(ctx->shader);
   LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, vertex_size);
   LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS);
   LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, "");
   return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, "");
}

static LLVMValueRef si_insert_input_v4i32(struct si_shader_context *ctx,
                                          LLVMValueRef ret, struct ac_arg param,
                                          unsigned return_index)
{
   LLVMValueRef v = ac_get_arg(&ctx->ac, param);

   for (unsigned i = 0; i < 4; i++) {
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
                                 ac_llvm_extract_elem(&ctx->ac, v, i),
                                 return_index + i, "");
   }
   return ret;
}

static void load_bitmasks_2x64(struct si_shader_context *ctx,
                               LLVMValueRef lds_ptr, unsigned dw_offset,
                               LLVMValueRef mask[2], LLVMValueRef *total_bitcount)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef ptr64 = LLVMBuildPointerCast(builder, lds_ptr,
                                             LLVMPointerType(LLVMArrayType(ctx->ac.i64, 2),
                                                             AC_ADDR_SPACE_LDS), "");
   for (unsigned i = 0; i < 2; i++) {
      LLVMValueRef index = LLVMConstInt(ctx->ac.i32, dw_offset / 2 + i, 0);
      mask[i] = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ptr64, index), "");
   }

   /* We get better code if we don't use the 128-bit bitcount. */
   *total_bitcount = LLVMBuildAdd(builder, ac_build_bit_count(&ctx->ac, mask[0]),
                                  ac_build_bit_count(&ctx->ac, mask[1]), "");
}

/**
 * Given a total thread count, update total and per-wave thread counts in input SGPRs
 * and return the per-wave thread count.
 *
 * \param new_num_threads    Total thread count on the input, per-wave thread count on the output.
 * \param tg_info            tg_info SGPR value
 * \param tg_info_num_bits   the bit size of thread count field in tg_info
 * \param tg_info_shift      the bit offset of the thread count field in tg_info
 * \param wave_info          merged_wave_info SGPR value
 * \param wave_info_num_bits the bit size of thread count field in merged_wave_info
 * \param wave_info_shift    the bit offset of the thread count field in merged_wave_info
 */
static void update_thread_counts(struct si_shader_context *ctx,
                                 LLVMValueRef *new_num_threads,
                                 LLVMValueRef *tg_info,
                                 unsigned tg_info_num_bits,
                                 unsigned tg_info_shift,
                                 LLVMValueRef *wave_info,
                                 unsigned wave_info_num_bits,
                                 unsigned wave_info_shift)
{
   LLVMBuilderRef builder = ctx->ac.builder;

   /* Update the total thread count. */
   unsigned tg_info_mask = ~(u_bit_consecutive(0, tg_info_num_bits) << tg_info_shift);
   *tg_info = LLVMBuildAnd(builder, *tg_info,
                           LLVMConstInt(ctx->ac.i32, tg_info_mask, 0), "");
   *tg_info = LLVMBuildOr(builder, *tg_info,
                          LLVMBuildShl(builder, *new_num_threads,
                                       LLVMConstInt(ctx->ac.i32, tg_info_shift, 0), ""), "");

   /* Update the per-wave thread count. */
   LLVMValueRef prev_threads = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
                                            LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), "");
   *new_num_threads = LLVMBuildSub(builder, *new_num_threads, prev_threads, "");
   *new_num_threads = ac_build_imax(&ctx->ac, *new_num_threads, ctx->ac.i32_0);
   *new_num_threads = ac_build_imin(&ctx->ac, *new_num_threads,
                                    LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0));
   unsigned wave_info_mask = ~(u_bit_consecutive(0, wave_info_num_bits) << wave_info_shift);
   *wave_info = LLVMBuildAnd(builder, *wave_info,
                             LLVMConstInt(ctx->ac.i32, wave_info_mask, 0), "");
   *wave_info = LLVMBuildOr(builder, *wave_info,
                            LLVMBuildShl(builder, *new_num_threads,
                                         LLVMConstInt(ctx->ac.i32, wave_info_shift, 0), ""), "");
}
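/* Worked example for the per-wave clamp above, assuming wave_size == 32 and
 * a new total of 45 threads:
 *   wave 0: clamp(45 - 0, 0, 32)  -> 32 threads
 *   wave 1: clamp(45 - 32, 0, 32) -> 13 threads
 *   wave 2: clamp(45 - 64, 0, 32) -> 0 threads
 */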

/**
 * Cull primitives for NGG VS or TES, then compact vertices, which happens
 * before the VS or TES main function. Return values for the main function.
 * Also return the position, which is passed to the shader as an input,
 * so that we don't compute it twice.
 */
void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi,
                                               unsigned max_outputs,
                                               LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader *shader = ctx->shader;
   struct si_shader_selector *sel = shader->selector;
   struct si_shader_info *info = &sel->info;
   LLVMBuilderRef builder = ctx->ac.builder;

   assert(shader->key.opt.ngg_culling);
   assert(shader->key.as_ngg);
   assert(sel->type == PIPE_SHADER_VERTEX ||
          (sel->type == PIPE_SHADER_TESS_EVAL && !shader->key.as_es));

   LLVMValueRef position[4] = {};
   for (unsigned i = 0; i < info->num_outputs; i++) {
      switch (info->output_semantic_name[i]) {
      case TGSI_SEMANTIC_POSITION:
         for (unsigned j = 0; j < 4; j++) {
            position[j] = LLVMBuildLoad(ctx->ac.builder,
                                        addrs[4 * i + j], "");
         }
         break;
      }
   }
   assert(position[0]);

   /* Store Position.XYZW into LDS. */
   LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
   for (unsigned chan = 0; chan < 4; chan++) {
      LLVMBuildStore(builder, ac_to_integer(&ctx->ac, position[chan]),
                     ac_build_gep0(&ctx->ac, es_vtxptr,
                                   LLVMConstInt(ctx->ac.i32, lds_pos_x + chan, 0)));
   }
   /* Store Position.XY / W into LDS. */
   for (unsigned chan = 0; chan < 2; chan++) {
      LLVMValueRef val = ac_build_fdiv(&ctx->ac, position[chan], position[3]);
      LLVMBuildStore(builder, ac_to_integer(&ctx->ac, val),
                     ac_build_gep0(&ctx->ac, es_vtxptr,
                                   LLVMConstInt(ctx->ac.i32, lds_pos_x_div_w + chan, 0)));
   }

   /* Store VertexID and InstanceID. ES threads will have to load them
    * from LDS after vertex compaction and use them instead of their own
    * system values.
    */
   bool uses_instance_id = false;
   bool uses_tes_prim_id = false;
   LLVMValueRef packed_data = ctx->ac.i32_0;

   if (ctx->type == PIPE_SHADER_VERTEX) {
      uses_instance_id = sel->info.uses_instanceid ||
                         shader->key.part.vs.prolog.instance_divisor_is_one ||
                         shader->key.part.vs.prolog.instance_divisor_is_fetched;

      LLVMBuildStore(builder, ctx->abi.vertex_id,
                     ac_build_gep0(&ctx->ac, es_vtxptr,
                                   LLVMConstInt(ctx->ac.i32, lds_vertex_id, 0)));
      if (uses_instance_id) {
         LLVMBuildStore(builder, ctx->abi.instance_id,
                        ac_build_gep0(&ctx->ac, es_vtxptr,
                                      LLVMConstInt(ctx->ac.i32, lds_instance_id, 0)));
      }
   } else {
      uses_tes_prim_id = sel->info.uses_primid ||
                         shader->key.mono.u.vs_export_prim_id;

      assert(ctx->type == PIPE_SHADER_TESS_EVAL);
      LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_u)),
                     ac_build_gep0(&ctx->ac, es_vtxptr,
                                   LLVMConstInt(ctx->ac.i32, lds_tes_u, 0)));
      LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_v)),
                     ac_build_gep0(&ctx->ac, es_vtxptr,
                                   LLVMConstInt(ctx->ac.i32, lds_tes_v, 0)));
      packed_data = LLVMBuildShl(builder, ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id),
                                 LLVMConstInt(ctx->ac.i32, lds_byte2_tes_rel_patch_id * 8, 0), "");
      if (uses_tes_prim_id) {
         LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args.tes_patch_id),
                        ac_build_gep0(&ctx->ac, es_vtxptr,
                                      LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)));
      }
   }
   /* Initialize the packed data. */
   LLVMBuildStore(builder, packed_data,
                  ac_build_gep0(&ctx->ac, es_vtxptr,
                                LLVMConstInt(ctx->ac.i32, lds_packed_data, 0)));
   ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);

   LLVMValueRef tid = ac_get_thread_id(&ctx->ac);

   /* Initialize the last 3 gs_ngg_scratch dwords to 0, because we may have
    * fewer than 4 waves, but we always read all 4 values. This is where the
    * thread bitmasks of unculled threads will be stored.
    *
    * gs_ngg_scratch layout: esmask[0..3]
    */
   ac_build_ifcc(&ctx->ac,
                 LLVMBuildICmp(builder, LLVMIntULT, get_thread_id_in_tg(ctx),
                               LLVMConstInt(ctx->ac.i32, 3, 0), ""), 16101);
   {
      LLVMValueRef index = LLVMBuildAdd(builder, tid, ctx->ac.i32_1, "");
      LLVMBuildStore(builder, ctx->ac.i32_0,
                     ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, index));
   }
   ac_build_endif(&ctx->ac, 16101);
   ac_build_s_barrier(&ctx->ac);

   /* The hardware requires that there are no holes between unculled vertices,
    * which means we have to pack ES threads, i.e. reduce the ES thread count
    * and move ES input VGPRs to lower threads. The upside is that varyings
    * are only fetched and computed for unculled vertices.
    *
    * Vertex compaction in GS threads:
    *
    * Part 1: Compute the surviving vertex mask in GS threads:
    * - Compute 4 32-bit surviving vertex masks in LDS. (max 4 waves)
    * - In GS, notify ES threads whether the vertex survived.
    * - Barrier
    * - ES threads will create the mask and store it in LDS.
    * - Barrier
    * - Each GS thread loads the vertex masks from LDS.
    *
    * Part 2: Compact ES threads in GS threads:
    * - Compute the prefix sum for all 3 vertices from the masks. These are the new
    *   thread IDs for each vertex within the primitive.
    * - Write the value of the old thread ID into the LDS address of the new thread ID.
    *   The ES thread will load the old thread ID and use it to load the position, VertexID,
    *   and InstanceID.
    * - Update vertex indices and null flag in the GS input VGPRs.
    * - Barrier
    *
    * Part 3: Update input GPRs:
    * - For all waves, update per-wave thread counts in input SGPRs.
    * - In ES threads, update the ES input VGPRs (VertexID, InstanceID, TES inputs).
    */

   LLVMValueRef vtxindex[3];
   if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) {
      /* For the GS fast launch, the VS prolog simply puts the Vertex IDs
       * into these VGPRs.
       */
      vtxindex[0] = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);
      vtxindex[1] = ac_get_arg(&ctx->ac, ctx->gs_vtx23_offset);
      vtxindex[2] = ac_get_arg(&ctx->ac, ctx->gs_vtx45_offset);
   } else {
      vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
      vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
      vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
   }
   LLVMValueRef gs_vtxptr[] = {
      ngg_nogs_vertex_ptr(ctx, vtxindex[0]),
      ngg_nogs_vertex_ptr(ctx, vtxindex[1]),
      ngg_nogs_vertex_ptr(ctx, vtxindex[2]),
   };
   es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));

   LLVMValueRef gs_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");

   /* Do culling in GS threads. */
   ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 16002);
   {
      /* Load positions. */
      LLVMValueRef pos[3][4] = {};
      for (unsigned vtx = 0; vtx < 3; vtx++) {
         for (unsigned chan = 0; chan < 4; chan++) {
            unsigned index;
            if (chan == 0 || chan == 1)
               index = lds_pos_x_div_w + chan;
            else if (chan == 3)
               index = lds_pos_w;
            else
               continue;

            LLVMValueRef addr = ac_build_gep0(&ctx->ac, gs_vtxptr[vtx],
                                              LLVMConstInt(ctx->ac.i32, index, 0));
            pos[vtx][chan] = LLVMBuildLoad(builder, addr, "");
            pos[vtx][chan] = ac_to_float(&ctx->ac, pos[vtx][chan]);
         }
      }

      /* Load the viewport state for small prim culling. */
      LLVMValueRef vp = ac_build_load_invariant(&ctx->ac,
                                                ac_get_arg(&ctx->ac, ctx->small_prim_cull_info),
                                                ctx->ac.i32_0);
      vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
      LLVMValueRef vp_scale[2], vp_translate[2];
      vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
      vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
      vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
      vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);

      /* Get the small prim filter precision. */
      LLVMValueRef small_prim_precision = si_unpack_param(ctx, ctx->vs_state_bits, 7, 4);
      small_prim_precision = LLVMBuildOr(builder, small_prim_precision,
                                         LLVMConstInt(ctx->ac.i32, 0x70, 0), "");
      small_prim_precision = LLVMBuildShl(builder, small_prim_precision,
                                          LLVMConstInt(ctx->ac.i32, 23, 0), "");
      small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->ac.f32, "");
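      /* This builds the float 2^(x - 15) directly in the exponent field:
       * the 4-bit field x is OR'd with 0x70 and shifted into bits [23:31),
       * giving an IEEE-754 exponent of (0x70 | x) - 127 = x - 15 with a zero
       * mantissa, i.e. a precision between 1/32768 and 1.0.
       */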

      /* Execute culling code. */
      struct ac_cull_options options = {};
      options.cull_front = shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE;
      options.cull_back = shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE;
      options.cull_view_xy = shader->key.opt.ngg_culling & SI_NGG_CULL_VIEW_SMALLPRIMS;
      options.cull_small_prims = options.cull_view_xy;
      options.cull_zero_area = options.cull_front || options.cull_back;
      options.cull_w = true;

      /* Tell ES threads whether their vertex survived. */
      ac_build_ifcc(&ctx->ac, ac_cull_triangle(&ctx->ac, pos, ctx->ac.i1true,
                                               vp_scale, vp_translate,
                                               small_prim_precision, &options), 16003);
      {
         LLVMBuildStore(builder, ctx->ac.i32_1, gs_accepted);
         for (unsigned vtx = 0; vtx < 3; vtx++) {
            LLVMBuildStore(builder, ctx->ac.i8_1,
                           si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte0_accept_flag));
         }
      }
      ac_build_endif(&ctx->ac, 16003);
   }
   ac_build_endif(&ctx->ac, 16002);
   ac_build_s_barrier(&ctx->ac);

   gs_accepted = LLVMBuildLoad(builder, gs_accepted, "");

   LLVMValueRef es_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i1, "");

   /* Convert the per-vertex flag to a thread bitmask in ES threads and store it in LDS. */
   ac_build_ifcc(&ctx->ac, si_is_es_thread(ctx), 16007);
   {
      LLVMValueRef es_accepted_flag =
         LLVMBuildLoad(builder,
                       si_build_gep_i8(ctx, es_vtxptr, lds_byte0_accept_flag), "");

      LLVMValueRef es_accepted_bool = LLVMBuildICmp(builder, LLVMIntNE,
                                                    es_accepted_flag, ctx->ac.i8_0, "");
      LLVMValueRef es_mask = ac_get_i1_sgpr_mask(&ctx->ac, es_accepted_bool);

      LLVMBuildStore(builder, es_accepted_bool, es_accepted);

      ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntEQ,
                                            tid, ctx->ac.i32_0, ""), 16008);
      {
         LLVMBuildStore(builder, es_mask,
                        ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch,
                                      get_wave_id_in_tg(ctx)));
      }
      ac_build_endif(&ctx->ac, 16008);
   }
   ac_build_endif(&ctx->ac, 16007);
   ac_build_s_barrier(&ctx->ac);

   /* Load the vertex masks and compute the new ES thread count. */
   LLVMValueRef es_mask[2], new_num_es_threads, kill_wave;
   load_bitmasks_2x64(ctx, ctx->gs_ngg_scratch, 0, es_mask, &new_num_es_threads);
   new_num_es_threads = ac_build_readlane_no_opt_barrier(&ctx->ac, new_num_es_threads, NULL);
   /* ES threads compute their prefix sum, which is the new ES thread ID.
    * Then they write the value of the old thread ID into the LDS address
    * of the new thread ID. It will be used to load input VGPRs from
    * the old thread's LDS location.
    */
   ac_build_ifcc(&ctx->ac, LLVMBuildLoad(builder, es_accepted, ""), 16009);
   {
      LLVMValueRef old_id = get_thread_id_in_tg(ctx);
      LLVMValueRef new_id = ac_prefix_bitcount_2x64(&ctx->ac, es_mask, old_id);

      LLVMBuildStore(builder, LLVMBuildTrunc(builder, old_id, ctx->ac.i8, ""),
                     si_build_gep_i8(ctx, ngg_nogs_vertex_ptr(ctx, new_id),
                                     lds_byte0_old_thread_id));
      LLVMBuildStore(builder, LLVMBuildTrunc(builder, new_id, ctx->ac.i8, ""),
                     si_build_gep_i8(ctx, es_vtxptr, lds_byte1_new_thread_id));
   }
   ac_build_endif(&ctx->ac, 16009);
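   /* For example, if only threads {0, 2, 5} were accepted, their prefix bit
    * counts (and thus new thread IDs) are {0, 1, 2}: each surviving vertex
    * moves to the lowest free slot, leaving no holes.
    */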

   /* Kill waves that have inactive threads. */
   kill_wave = LLVMBuildICmp(builder, LLVMIntULE,
                             ac_build_imax(&ctx->ac, new_num_es_threads, ngg_get_prim_cnt(ctx)),
                             LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
                                          LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), ""), "");
   ac_build_ifcc(&ctx->ac, kill_wave, 19202);
   {
      /* If we are killing wave 0, send the message that there are no
       * primitives in this threadgroup.
       */
      ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx),
                                    ctx->ac.i32_0, ctx->ac.i32_0);
      ac_build_s_endpgm(&ctx->ac);
   }
   ac_build_endif(&ctx->ac, 19202);
   ac_build_s_barrier(&ctx->ac);

   /* Send the final vertex and primitive counts. */
   ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx),
                                 new_num_es_threads, ngg_get_prim_cnt(ctx));

   /* Update thread counts in SGPRs. */
   LLVMValueRef new_gs_tg_info = ac_get_arg(&ctx->ac, ctx->gs_tg_info);
   LLVMValueRef new_merged_wave_info = ac_get_arg(&ctx->ac, ctx->merged_wave_info);

   /* This also converts the thread count from the total count to the per-wave count. */
   update_thread_counts(ctx, &new_num_es_threads, &new_gs_tg_info, 9, 12,
                        &new_merged_wave_info, 8, 0);

   /* Update vertex indices in VGPR0 (same format as NGG passthrough). */
   LLVMValueRef new_vgpr0 = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");

   /* Set the null flag at the beginning (culled), and then
    * overwrite it for accepted primitives.
    */
   LLVMBuildStore(builder, LLVMConstInt(ctx->ac.i32, 1u << 31, 0), new_vgpr0);

   /* Get vertex indices after vertex compaction. */
   ac_build_ifcc(&ctx->ac, LLVMBuildTrunc(builder, gs_accepted, ctx->ac.i1, ""), 16011);
   {
      struct ac_ngg_prim prim = {};
      prim.num_vertices = 3;
      prim.isnull = ctx->ac.i1false;

      for (unsigned vtx = 0; vtx < 3; vtx++) {
         prim.index[vtx] =
            LLVMBuildLoad(builder,
                          si_build_gep_i8(ctx, gs_vtxptr[vtx],
                                          lds_byte1_new_thread_id), "");
         prim.index[vtx] = LLVMBuildZExt(builder, prim.index[vtx], ctx->ac.i32, "");
         prim.edgeflag[vtx] = ngg_get_initial_edgeflag(ctx, vtx);
      }

      /* Set the new GS input VGPR. */
      LLVMBuildStore(builder, ac_pack_prim_export(&ctx->ac, &prim), new_vgpr0);
   }
   ac_build_endif(&ctx->ac, 16011);

   if (gfx10_ngg_export_prim_early(shader))
      gfx10_ngg_build_export_prim(ctx, NULL, LLVMBuildLoad(builder, new_vgpr0, ""));

   /* Set the new ES input VGPRs. */
   LLVMValueRef es_data[4];
   LLVMValueRef old_thread_id = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");

   for (unsigned i = 0; i < 4; i++)
      es_data[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");

   ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid,
                                         new_num_es_threads, ""), 16012);
   {
      LLVMValueRef old_id, old_es_vtxptr, tmp;

      /* Load ES input VGPRs from the ES thread before compaction. */
      old_id = LLVMBuildLoad(builder,
                             si_build_gep_i8(ctx, es_vtxptr, lds_byte0_old_thread_id), "");
      old_id = LLVMBuildZExt(builder, old_id, ctx->ac.i32, "");

      LLVMBuildStore(builder, old_id, old_thread_id);
      old_es_vtxptr = ngg_nogs_vertex_ptr(ctx, old_id);

      for (unsigned i = 0; i < 2; i++) {
         tmp = LLVMBuildLoad(builder,
                             ac_build_gep0(&ctx->ac, old_es_vtxptr,
                                           LLVMConstInt(ctx->ac.i32, lds_vertex_id + i, 0)), "");
         LLVMBuildStore(builder, tmp, es_data[i]);
      }

      if (ctx->type == PIPE_SHADER_TESS_EVAL) {
         tmp = LLVMBuildLoad(builder,
                             si_build_gep_i8(ctx, old_es_vtxptr,
                                             lds_byte2_tes_rel_patch_id), "");
         tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
         LLVMBuildStore(builder, tmp, es_data[2]);

         if (uses_tes_prim_id) {
            tmp = LLVMBuildLoad(builder,
                                ac_build_gep0(&ctx->ac, old_es_vtxptr,
                                              LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)), "");
            LLVMBuildStore(builder, tmp, es_data[3]);
         }
      }
   }
   ac_build_endif(&ctx->ac, 16012);

   /* Return values for the main function. */
   LLVMValueRef ret = ctx->return_value;
   LLVMValueRef val;

   ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_gs_tg_info, 2, "");
   ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_merged_wave_info, 3, "");
   if (ctx->type == PIPE_SHADER_TESS_EVAL)
      ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 4);

   ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers,
                             8 + SI_SGPR_RW_BUFFERS);
   ret = si_insert_input_ptr(ctx, ret,
                             ctx->bindless_samplers_and_images,
                             8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
   ret = si_insert_input_ptr(ctx, ret,
                             ctx->const_and_shader_buffers,
                             8 + SI_SGPR_CONST_AND_SHADER_BUFFERS);
   ret = si_insert_input_ptr(ctx, ret,
                             ctx->samplers_and_images,
                             8 + SI_SGPR_SAMPLERS_AND_IMAGES);
   ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits,
                             8 + SI_SGPR_VS_STATE_BITS);

   if (ctx->type == PIPE_SHADER_VERTEX) {
      ret = si_insert_input_ptr(ctx, ret, ctx->args.base_vertex,
                                8 + SI_SGPR_BASE_VERTEX);
      ret = si_insert_input_ptr(ctx, ret, ctx->args.start_instance,
                                8 + SI_SGPR_START_INSTANCE);
      ret = si_insert_input_ptr(ctx, ret, ctx->args.draw_id,
                                8 + SI_SGPR_DRAWID);
      ret = si_insert_input_ptr(ctx, ret, ctx->vertex_buffers,
                                8 + SI_VS_NUM_USER_SGPR);

      for (unsigned i = 0; i < shader->selector->num_vbos_in_user_sgprs; i++) {
         ret = si_insert_input_v4i32(ctx, ret, ctx->vb_descriptors[i],
                                     8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + i * 4);
      }
   } else {
      assert(ctx->type == PIPE_SHADER_TESS_EVAL);
      ret = si_insert_input_ptr(ctx, ret, ctx->tcs_offchip_layout,
                                8 + SI_SGPR_TES_OFFCHIP_LAYOUT);
      ret = si_insert_input_ptr(ctx, ret, ctx->tes_offchip_addr,
                                8 + SI_SGPR_TES_OFFCHIP_ADDR);
   }

   unsigned vgpr;
   if (ctx->type == PIPE_SHADER_VERTEX) {
      if (shader->selector->num_vbos_in_user_sgprs) {
         vgpr = 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST +
                shader->selector->num_vbos_in_user_sgprs * 4;
      } else {
         vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1;
      }
   } else {
      vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
   }

   val = LLVMBuildLoad(builder, new_vgpr0, "");
   ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val),
                              vgpr++, "");
   vgpr++; /* gs_vtx23_offset */

   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
   vgpr++; /* gs_vtx45_offset */

   if (ctx->type == PIPE_SHADER_VERTEX) {
      val = LLVMBuildLoad(builder, es_data[0], "");
      ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val),
                                 vgpr++, ""); /* VGPR5 - VertexID */
      vgpr += 2;
      if (uses_instance_id) {
         val = LLVMBuildLoad(builder, es_data[1], "");
         ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val),
                                    vgpr++, ""); /* VGPR8 - InstanceID */
      } else {
         vgpr++;
      }
   } else {
      assert(ctx->type == PIPE_SHADER_TESS_EVAL);
      unsigned num_vgprs = uses_tes_prim_id ? 4 : 3;
      for (unsigned i = 0; i < num_vgprs; i++) {
         val = LLVMBuildLoad(builder, es_data[i], "");
         ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val),
                                    vgpr++, "");
      }
      if (num_vgprs == 3)
         vgpr++;
   }
   /* Return the old thread ID. */
   val = LLVMBuildLoad(builder, old_thread_id, "");
   ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");

   /* These two also use LDS. */
   if (sel->info.writes_edgeflag ||
       (ctx->type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id))
      ac_build_s_barrier(&ctx->ac);

   ctx->return_value = ret;
}

/**
 * Emit the epilogue of an API VS or TES shader compiled as ESGS shader.
 */
void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
                             unsigned max_outputs,
                             LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_selector *sel = ctx->shader->selector;
   struct si_shader_info *info = &sel->info;
   struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef tmp, tmp2;

   assert(!ctx->shader->is_gs_copy_shader);
   assert(info->num_outputs <= max_outputs);

   LLVMValueRef vertex_ptr = NULL;

   if (sel->so.num_outputs || sel->info.writes_edgeflag)
      vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));

   for (unsigned i = 0; i < info->num_outputs; i++) {
      outputs[i].semantic_name = info->output_semantic_name[i];
      outputs[i].semantic_index = info->output_semantic_index[i];

      for (unsigned j = 0; j < 4; j++) {
         outputs[i].vertex_stream[j] =
            (info->output_streams[i] >> (2 * j)) & 3;

         /* TODO: we may store more outputs than streamout needs,
          * but streamout performance isn't that important.
          */
         if (sel->so.num_outputs) {
            tmp = ac_build_gep0(&ctx->ac, vertex_ptr,
                                LLVMConstInt(ctx->ac.i32, 4 * i + j, false));
            tmp2 = LLVMBuildLoad(builder, addrs[4 * i + j], "");
            tmp2 = ac_to_integer(&ctx->ac, tmp2);
            LLVMBuildStore(builder, tmp2, tmp);
         }
      }

      /* Store the edgeflag at the end (if streamout is enabled) */
      if (info->output_semantic_name[i] == TGSI_SEMANTIC_EDGEFLAG &&
          sel->info.writes_edgeflag) {
         LLVMValueRef edgeflag = LLVMBuildLoad(builder, addrs[4 * i], "");
         /* The output is a float, but the hw expects a 1-bit integer. */
         edgeflag = LLVMBuildFPToUI(ctx->ac.builder, edgeflag, ctx->ac.i32, "");
         edgeflag = ac_build_umin(&ctx->ac, edgeflag, ctx->ac.i32_1);

         tmp = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
         tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp);
         LLVMBuildStore(builder, edgeflag, tmp);
      }
   }

   bool unterminated_es_if_block =
      !sel->so.num_outputs &&
      !sel->info.writes_edgeflag &&
      !ctx->screen->use_ngg_streamout && /* no query buffer */
      (ctx->type != PIPE_SHADER_VERTEX ||
       !ctx->shader->key.mono.u.vs_export_prim_id);

   if (!unterminated_es_if_block)
      ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);

   LLVMValueRef is_gs_thread = si_is_gs_thread(ctx);
   LLVMValueRef is_es_thread = si_is_es_thread(ctx);
   LLVMValueRef vtxindex[3];

   if (ctx->shader->key.opt.ngg_culling) {
      vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 9);
      vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 10, 9);
      vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 20, 9);
   } else {
      vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
      vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
      vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
   }
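   /* With culling, the cull shader part packed all three indices into one
    * VGPR using the NGG passthrough encoding: 10 bits per vertex, with a
    * 9-bit index at bits 0/10/20 and the edge flag at bits 9/19/29, hence
    * the 9-bit unpacks above.
    */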

   /* Determine the number of vertices per primitive. */
   unsigned num_vertices;
   LLVMValueRef num_vertices_val = ngg_get_vertices_per_prim(ctx, &num_vertices);

   /* Streamout */
   LLVMValueRef emitted_prims = NULL;

   if (sel->so.num_outputs) {
      assert(!unterminated_es_if_block);

      struct ngg_streamout nggso = {};
      nggso.num_vertices = num_vertices_val;
      nggso.prim_enable[0] = is_gs_thread;

      for (unsigned i = 0; i < num_vertices; ++i)
         nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);

      build_streamout(ctx, &nggso);
      emitted_prims = nggso.emit[0];
   }

   LLVMValueRef user_edgeflags[3] = {};

   if (sel->info.writes_edgeflag) {
      assert(!unterminated_es_if_block);

      /* Streamout already inserted the barrier, so don't insert it again. */
      if (!sel->so.num_outputs)
         ac_build_s_barrier(&ctx->ac);

      ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
      /* Load edge flags from ES threads and store them into VGPRs in GS threads. */
      for (unsigned i = 0; i < num_vertices; i++) {
         tmp = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
         tmp2 = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
         tmp = ac_build_gep0(&ctx->ac, tmp, tmp2);
         tmp = LLVMBuildLoad(builder, tmp, "");
         tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");

         user_edgeflags[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i1, "");
         LLVMBuildStore(builder, tmp, user_edgeflags[i]);
      }
      ac_build_endif(&ctx->ac, 5400);
   }

   /* Copy Primitive IDs from GS threads to the LDS address corresponding
    * to the ES thread of the provoking vertex.
    */
   if (ctx->type == PIPE_SHADER_VERTEX &&
       ctx->shader->key.mono.u.vs_export_prim_id) {
      assert(!unterminated_es_if_block);

      /* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */
      if (sel->so.num_outputs || sel->info.writes_edgeflag)
         ac_build_s_barrier(&ctx->ac);

      ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
      /* Extract the PROVOKING_VTX_INDEX field. */
      LLVMValueRef provoking_vtx_in_prim =
         si_unpack_param(ctx, ctx->vs_state_bits, 4, 2);

      /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */
      LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3);
      LLVMValueRef provoking_vtx_index =
         LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, "");
      LLVMValueRef vertex_ptr = ngg_nogs_vertex_ptr(ctx, provoking_vtx_index);

      LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args.gs_prim_id),
                     ac_build_gep0(&ctx->ac, vertex_ptr, ctx->ac.i32_0));
      ac_build_endif(&ctx->ac, 5400);
   }

   /* Update query buffer */
   if (ctx->screen->use_ngg_streamout &&
       !info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
      assert(!unterminated_es_if_block);

      tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1);
      tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
      ac_build_ifcc(&ctx->ac, tmp, 5029); /* if (STREAMOUT_QUERY_ENABLED) */
      tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
      ac_build_ifcc(&ctx->ac, tmp, 5030);
      tmp = LLVMBuildICmp(builder, LLVMIntULE, ac_get_thread_id(&ctx->ac),
                          sel->so.num_outputs ? ctx->ac.i32_1 : ctx->ac.i32_0, "");
      ac_build_ifcc(&ctx->ac, tmp, 5031);
      {
         LLVMValueRef args[] = {
            ngg_get_prim_cnt(ctx),
            ngg_get_query_buf(ctx),
            LLVMConstInt(ctx->ac.i32, 16, false), /* offset of stream[0].generated_primitives */
            ctx->ac.i32_0, /* soffset */
            ctx->ac.i32_0, /* cachepolicy */
         };

         if (sel->so.num_outputs) {
            args[0] = ac_build_writelane(&ctx->ac, args[0], emitted_prims, ctx->ac.i32_1);
            args[2] = ac_build_writelane(&ctx->ac, args[2],
                                         LLVMConstInt(ctx->ac.i32, 24, false), ctx->ac.i32_1);
         }
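         /* Lane 0 atomically adds the generated primitive count at buffer
          * offset 16 and, when streamout is enabled, lane 1 adds the emitted
          * primitive count at offset 24 (presumably
          * stream[0].emitted_primitives in the query buffer layout).
          */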
1416
1417 /* TODO: should this be 64-bit atomics? */
1418 ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32",
1419 ctx->ac.i32, args, 5, 0);
1420 }
1421 ac_build_endif(&ctx->ac, 5031);
1422 ac_build_endif(&ctx->ac, 5030);
1423 ac_build_endif(&ctx->ac, 5029);
1424 }
1425
1426 /* Build the primitive export. */
1427 if (!gfx10_ngg_export_prim_early(ctx->shader)) {
1428 assert(!unterminated_es_if_block);
1429 gfx10_ngg_build_export_prim(ctx, user_edgeflags, NULL);
1430 }
1431
1432 /* Export per-vertex data (positions and parameters). */
1433 if (!unterminated_es_if_block)
1434 ac_build_ifcc(&ctx->ac, is_es_thread, 6002);
1435 {
1436 unsigned i;
1437
1438 /* Unconditionally (re-)load the values for proper SSA form. */
1439 for (i = 0; i < info->num_outputs; i++) {
1440 /* If the NGG cull shader part computed the position, don't
1441 * use the position from the current shader part. Instead,
1442 * load it from LDS.
1443 */
1444 if (info->output_semantic_name[i] == TGSI_SEMANTIC_POSITION &&
1445 ctx->shader->key.opt.ngg_culling) {
1446 vertex_ptr = ngg_nogs_vertex_ptr(ctx,
1447 ac_get_arg(&ctx->ac, ctx->ngg_old_thread_id));
1448
1449 for (unsigned j = 0; j < 4; j++) {
1450 tmp = LLVMConstInt(ctx->ac.i32, lds_pos_x + j, 0);
1451 tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp);
1452 tmp = LLVMBuildLoad(builder, tmp, "");
1453 outputs[i].values[j] = ac_to_float(&ctx->ac, tmp);
1454 }
1455 } else {
1456 for (unsigned j = 0; j < 4; j++) {
1457 outputs[i].values[j] =
1458 LLVMBuildLoad(builder,
1459 addrs[4 * i + j], "");
1460 }
1461 }
1462 }
1463
1464 if (ctx->shader->key.mono.u.vs_export_prim_id) {
1465 outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
1466 outputs[i].semantic_index = 0;
1467
1468 if (ctx->type == PIPE_SHADER_VERTEX) {
1469 /* Wait for GS stores to finish. */
1470 ac_build_s_barrier(&ctx->ac);
1471
1472 tmp = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
1473 tmp = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
1474 outputs[i].values[0] = LLVMBuildLoad(builder, tmp, "");
1475 } else {
1476 assert(ctx->type == PIPE_SHADER_TESS_EVAL);
1477 outputs[i].values[0] = si_get_primitive_id(ctx, 0);
1478 }
1479
1480 outputs[i].values[0] = ac_to_float(&ctx->ac, outputs[i].values[0]);
1481 for (unsigned j = 1; j < 4; j++)
1482 outputs[i].values[j] = LLVMGetUndef(ctx->ac.f32);
1483
1484 memset(outputs[i].vertex_stream, 0,
1485 sizeof(outputs[i].vertex_stream));
1486 i++;
1487 }
1488
1489 si_llvm_build_vs_exports(ctx, outputs, i);
1490 }
1491 ac_build_endif(&ctx->ac, 6002);
1492 }
1493
1494 static LLVMValueRef
1495 ngg_gs_get_vertex_storage(struct si_shader_context *ctx)
1496 {
1497 const struct si_shader_selector *sel = ctx->shader->selector;
1498 const struct si_shader_info *info = &sel->info;
1499
1500 LLVMTypeRef elements[2] = {
1501 LLVMArrayType(ctx->ac.i32, 4 * info->num_outputs),
1502 LLVMArrayType(ctx->ac.i8, 4),
1503 };
1504 LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false);
1505 type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS);
1506 return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, "");
1507 }
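/* For example (illustrative): a GS with two vec4 outputs gets the LDS
 * element type {[8 x i32], [4 x i8]}, i.e. 9 dwords per vertex. The
 * per-vertex size is always 4 * num_outputs + 1 dwords, and this odd size
 * is what the layout comment below relies on to avoid bank conflicts.
 */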
1508
1509 /**
1510 * Return a pointer to the LDS storage reserved for the N'th vertex, where N
1511 * is in emit order; that is:
1512 * - during the epilogue, N is the threadidx (relative to the entire threadgroup)
1513 * - during vertex emit, i.e. while the API GS shader invocation is running,
1514 * N = threadidx * gs_max_out_vertices + emitidx
1515 *
1516 * Goals of the LDS memory layout:
1517 * 1. Eliminate bank conflicts on write for geometry shaders that have all emits
1518 * in uniform control flow
1519 * 2. Eliminate bank conflicts on read for export if, additionally, there is no
1520 * culling
1521 * 3. Agnostic to the number of waves (since we don't know it before compiling)
1522 * 4. Allow coalescing of LDS instructions (ds_write_b128 etc.)
1523 * 5. Avoid wasting memory.
1524 *
1525 * We use an AoS layout due to point 4 (this also helps point 3). In an AoS
1526 * layout, elimination of bank conflicts requires that each vertex occupy an
1527 * odd number of dwords. We use the additional dword to store the output stream
1528 * index as well as a flag to indicate whether this vertex ends a primitive
1529 * for rasterization.
1530 *
1531 * Swizzling is required to satisfy points 1 and 2 simultaneously.
1532 *
1533 * Vertices are stored in emit order (gsthread * gs_max_out_vertices + emitidx).
1534 * Indices are swizzled in groups of 32, which ensures point 1 without
1535 * disturbing point 2.
1536 *
1537 * \return an LDS pointer to type {[N x i32], [4 x i8]}
1538 */
1539 static LLVMValueRef
1540 ngg_gs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vertexidx)
1541 {
1542 struct si_shader_selector *sel = ctx->shader->selector;
1543 LLVMBuilderRef builder = ctx->ac.builder;
1544 LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);
1545
1546 /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
1547 unsigned write_stride_2exp = ffs(sel->gs_max_out_vertices) - 1;
1548 if (write_stride_2exp) {
1549 LLVMValueRef row =
1550 LLVMBuildLShr(builder, vertexidx,
1551 LLVMConstInt(ctx->ac.i32, 5, false), "");
1552 LLVMValueRef swizzle =
1553 LLVMBuildAnd(builder, row,
1554 LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1,
1555 false), "");
1556 vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, "");
1557 }
1558
1559 return ac_build_gep0(&ctx->ac, storage, vertexidx);
1560 }
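/* Scalar sketch of the swizzle above (illustrative only, not driver code):
 *
 *    swizzled = vertexidx ^ ((vertexidx >> 5) & ((1u << write_stride_2exp) - 1));
 *
 * E.g. with gs_max_out_vertices == 6 (write_stride_2exp == 1), vertices
 * 0..31 keep their index, vertices 32..63 get bit 0 flipped, and so on
 * alternating per group of 32. The XOR is a bijection within each group,
 * so no two emit slots collide.
 */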
1561
1562 static LLVMValueRef
1563 ngg_gs_emit_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef gsthread,
1564 LLVMValueRef emitidx)
1565 {
1566 struct si_shader_selector *sel = ctx->shader->selector;
1567 LLVMBuilderRef builder = ctx->ac.builder;
1568 LLVMValueRef tmp;
1569
1570 tmp = LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false);
1571 tmp = LLVMBuildMul(builder, tmp, gsthread, "");
1572 const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, "");
1573 return ngg_gs_vertex_ptr(ctx, vertexidx);
1574 }
1575
1576 static LLVMValueRef
1577 ngg_gs_get_emit_output_ptr(struct si_shader_context *ctx, LLVMValueRef vertexptr,
1578 unsigned out_idx)
1579 {
1580 LLVMValueRef gep_idx[3] = {
1581 ctx->ac.i32_0, /* implied C-style array */
1582 ctx->ac.i32_0, /* first struct entry */
1583 LLVMConstInt(ctx->ac.i32, out_idx, false),
1584 };
1585 return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
1586 }
1587
1588 static LLVMValueRef
1589 ngg_gs_get_emit_primflag_ptr(struct si_shader_context *ctx, LLVMValueRef vertexptr,
1590 unsigned stream)
1591 {
1592 LLVMValueRef gep_idx[3] = {
1593 ctx->ac.i32_0, /* implied C-style array */
1594 ctx->ac.i32_1, /* second struct entry */
1595 LLVMConstInt(ctx->ac.i32, stream, false),
1596 };
1597 return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
1598 }
1599
1600 void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx,
1601 unsigned stream,
1602 LLVMValueRef *addrs)
1603 {
1604 const struct si_shader_selector *sel = ctx->shader->selector;
1605 const struct si_shader_info *info = &sel->info;
1606 LLVMBuilderRef builder = ctx->ac.builder;
1607 LLVMValueRef tmp;
1608 const LLVMValueRef vertexidx =
1609 LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
1610
1611 /* If this thread has already emitted the declared maximum number of
1612 * vertices, skip the write: excessive vertex emissions are not
1613 * supposed to have any effect.
1614 */
1615 const LLVMValueRef can_emit =
1616 LLVMBuildICmp(builder, LLVMIntULT, vertexidx,
1617 LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), "");
1618
1619 tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
1620 tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, "");
1621 LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
1622
1623 ac_build_ifcc(&ctx->ac, can_emit, 9001);
1624
1625 const LLVMValueRef vertexptr =
1626 ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx);
1627 unsigned out_idx = 0;
1628 for (unsigned i = 0; i < info->num_outputs; i++) {
1629 for (unsigned chan = 0; chan < 4; chan++, out_idx++) {
1630 if (!(info->output_usagemask[i] & (1 << chan)) ||
1631 ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
1632 continue;
1633
1634 LLVMValueRef out_val = LLVMBuildLoad(builder, addrs[4 * i + chan], "");
1635 out_val = ac_to_integer(&ctx->ac, out_val);
1636 LLVMBuildStore(builder, out_val,
1637 ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx));
1638 }
1639 }
1640 assert(out_idx * 4 == sel->gsvs_vertex_size);
1641
1642 /* Determine and store whether this vertex completed a primitive. */
1643 const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");
1644
1645 tmp = LLVMConstInt(ctx->ac.i32, u_vertices_per_prim(sel->gs_output_prim) - 1, false);
1646 const LLVMValueRef iscompleteprim =
1647 LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, "");
1648
1649 /* Since the geometry shader emits triangle strips, we need to
1650 * track which primitive is odd and swap vertex indices to get
1651 * the correct vertex order.
1652 */
1653 LLVMValueRef is_odd = ctx->ac.i1false;
1654 if (stream == 0 && u_vertices_per_prim(sel->gs_output_prim) == 3) {
1655 tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, "");
1656 is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, "");
1657 }
1658
1659 tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, "");
1660 LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]);
1661
1662 /* The per-vertex primitive flag encoding:
1663 * bit 0: whether this vertex finishes a primitive
1664 * bit 1: whether the primitive is odd (if we are emitting triangle strips)
1665 */
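/* Worked example for a triangle strip (illustrative): the vertex emitted
 * with curverts == 2 completes the first (even) triangle and stores 0b01,
 * the vertex with curverts == 3 completes the second (odd) triangle and
 * stores 0b11, and a vertex that completes no primitive stores 0b00.
 */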
1666 tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
1667 tmp = LLVMBuildOr(builder, tmp,
1668 LLVMBuildShl(builder,
1669 LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""),
1670 ctx->ac.i8_1, ""), "");
1671 LLVMBuildStore(builder, tmp, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream));
1672
1673 tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
1674 tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
1675 LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
1676
1677 ac_build_endif(&ctx->ac, 9001);
1678 }
1679
1680 void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx)
1681 {
1682 /* Zero out the part of LDS scratch that is used to accumulate the
1683 * per-stream generated primitive count.
1684 */
1685 LLVMBuilderRef builder = ctx->ac.builder;
1686 LLVMValueRef scratchptr = ctx->gs_ngg_scratch;
1687 LLVMValueRef tid = get_thread_id_in_tg(ctx);
1688 LLVMValueRef tmp;
1689
1690 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");
1691 ac_build_ifcc(&ctx->ac, tmp, 5090);
1692 {
1693 LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid);
1694 LLVMBuildStore(builder, ctx->ac.i32_0, ptr);
1695 }
1696 ac_build_endif(&ctx->ac, 5090);
1697
1698 ac_build_s_barrier(&ctx->ac);
1699 }
1700
1701 void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
1702 {
1703 const struct si_shader_selector *sel = ctx->shader->selector;
1704 const struct si_shader_info *info = &sel->info;
1705 const unsigned verts_per_prim = u_vertices_per_prim(sel->gs_output_prim);
1706 LLVMBuilderRef builder = ctx->ac.builder;
1707 LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false);
1708 LLVMValueRef tmp, tmp2;
1709
1710 /* Zero out remaining (non-emitted) primitive flags.
1711 *
1712 * Note: Alternatively, we could pass the relevant gs_next_vertex to
1713 * the emit threads via LDS. This is likely worse in the expected
1714 * typical case where each GS thread emits the full set of
1715 * vertices.
1716 */
1717 for (unsigned stream = 0; stream < 4; ++stream) {
1718 if (!info->num_stream_output_components[stream])
1719 continue;
1720
1721 const LLVMValueRef gsthread = get_thread_id_in_tg(ctx);
1722
1723 ac_build_bgnloop(&ctx->ac, 5100);
1724
1725 const LLVMValueRef vertexidx =
1726 LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
1727 tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx,
1728 LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), "");
1729 ac_build_ifcc(&ctx->ac, tmp, 5101);
1730 ac_build_break(&ctx->ac);
1731 ac_build_endif(&ctx->ac, 5101);
1732
1733 tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
1734 LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
1735
1736 tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx);
1737 LLVMBuildStore(builder, i8_0, ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream));
1738
1739 ac_build_endloop(&ctx->ac, 5100);
1740 }
1741
1742 /* Accumulate the generated primitive counts across the entire threadgroup. */
1743 for (unsigned stream = 0; stream < 4; ++stream) {
1744 if (!info->num_stream_output_components[stream])
1745 continue;
1746
1747 LLVMValueRef numprims =
1748 LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
1749 numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size);
1750
1751 tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, "");
1752 ac_build_ifcc(&ctx->ac, tmp, 5105);
1753 {
1754 LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
1755 ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch,
1756 LLVMConstInt(ctx->ac.i32, stream, false)),
1757 numprims, LLVMAtomicOrderingMonotonic, false);
1758 }
1759 ac_build_endif(&ctx->ac, 5105);
1760 }
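/* Summary of the pattern above (illustrative): ac_build_reduce sums the
 * per-thread counts within each wave, then lane 0 of every wave atomically
 * adds its wave total into gs_ngg_scratch[stream], so after the barrier
 * below each scratch slot holds the threadgroup-wide count.
 */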
1761
1762 ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
1763
1764 ac_build_s_barrier(&ctx->ac);
1765
1766 const LLVMValueRef tid = get_thread_id_in_tg(ctx);
1767 LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);
1768
1769 /* Streamout */
1770 if (sel->so.num_outputs) {
1771 struct ngg_streamout nggso = {};
1772
1773 nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false);
1774
1775 LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid);
1776 for (unsigned stream = 0; stream < 4; ++stream) {
1777 if (!info->num_stream_output_components[stream])
1778 continue;
1779
1780 tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), "");
1781 tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1782 tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
1783 nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, "");
1784 }
1785
1786 for (unsigned i = 0; i < verts_per_prim; ++i) {
1787 tmp = LLVMBuildSub(builder, tid,
1788 LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
1789 tmp = ngg_gs_vertex_ptr(ctx, tmp);
1790 nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
1791 }
1792
1793 build_streamout(ctx, &nggso);
1794 }
1795
1796 /* Write shader query data. */
1797 if (ctx->screen->use_ngg_streamout) {
1798 tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1);
1799 tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1800 ac_build_ifcc(&ctx->ac, tmp, 5109); /* if (STREAMOUT_QUERY_ENABLED) */
1801 unsigned num_query_comps = sel->so.num_outputs ? 8 : 4;
1802 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid,
1803 LLVMConstInt(ctx->ac.i32, num_query_comps, false), "");
1804 ac_build_ifcc(&ctx->ac, tmp, 5110);
1805 {
1806 LLVMValueRef offset;
1807 tmp = tid;
1808 if (sel->so.num_outputs)
1809 tmp = LLVMBuildAnd(builder, tmp, LLVMConstInt(ctx->ac.i32, 3, false), "");
1810 offset = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 32, false), "");
1811 if (sel->so.num_outputs) {
1812 tmp = LLVMBuildLShr(builder, tid, LLVMConstInt(ctx->ac.i32, 2, false), "");
1813 tmp = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 8, false), "");
1814 offset = LLVMBuildAdd(builder, offset, tmp, "");
1815 }
1816
1817 tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
1818 LLVMValueRef args[] = {
1819 tmp,
1820 ngg_get_query_buf(ctx),
1821 offset,
1822 LLVMConstInt(ctx->ac.i32, 16, false), /* soffset */
1823 ctx->ac.i32_0, /* cachepolicy */
1824 };
1825 ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32",
1826 ctx->ac.i32, args, 5, 0);
1827 }
1828 ac_build_endif(&ctx->ac, 5110);
1829 ac_build_endif(&ctx->ac, 5109);
1830 }
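/* Offsets produced above when streamout is active (derived from the code;
 * the query-buffer layout itself is an assumption): threads 0..3 add the
 * per-stream generated-primitive counts at byte offsets 0/32/64/96 past
 * the soffset of 16, and threads 4..7 add gs_ngg_scratch[4..7]
 * (presumably the streamed-out primitive counts) at offsets 8/40/72/104.
 */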
1831
1832 /* Determine vertex liveness. */
1833 LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive");
1834
1835 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
1836 ac_build_ifcc(&ctx->ac, tmp, 5120);
1837 {
1838 for (unsigned i = 0; i < verts_per_prim; ++i) {
1839 const LLVMValueRef primidx =
1840 LLVMBuildAdd(builder, tid,
1841 LLVMConstInt(ctx->ac.i32, i, false), "");
1842
1843 if (i > 0) {
1844 tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, "");
1845 ac_build_ifcc(&ctx->ac, tmp, 5121 + i);
1846 }
1847
1848 /* Load primitive liveness */
1849 tmp = ngg_gs_vertex_ptr(ctx, primidx);
1850 tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
1851 const LLVMValueRef primlive =
1852 LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1853
1854 tmp = LLVMBuildLoad(builder, vertliveptr, "");
1855 tmp = LLVMBuildOr(builder, tmp, primlive, "");
1856 LLVMBuildStore(builder, tmp, vertliveptr);
1857
1858 if (i > 0)
1859 ac_build_endif(&ctx->ac, 5121 + i);
1860 }
1861 }
1862 ac_build_endif(&ctx->ac, 5120);
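/* E.g. for triangles (verts_per_prim == 3): the vertex stored at tid can
 * only be referenced by the primitives completing at slots tid, tid+1 and
 * tid+2 (bounded by num_emit_threads), so it is live iff at least one of
 * those primflags has bit 0 set.
 */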
1863
1864 /* Scan vertex liveness across the threadgroup: the exclusive prefix sum gives each live vertex its compacted index, and the reduction gives the total live-vertex count. */
1865 LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, "");
1866 struct ac_wg_scan vertlive_scan = {};
1867 vertlive_scan.op = nir_op_iadd;
1868 vertlive_scan.enable_reduce = true;
1869 vertlive_scan.enable_exclusive = true;
1870 vertlive_scan.src = vertlive;
1871 vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0);
1872 vertlive_scan.waveidx = get_wave_id_in_tg(ctx);
1873 vertlive_scan.numwaves = get_tgsize(ctx);
1874 vertlive_scan.maxwaves = 8;
1875
1876 ac_build_wg_scan(&ctx->ac, &vertlive_scan);
1877
1878 /* Skip all exports (including index exports) when possible. On early
1879 * gfx10 revisions this also avoids hangs.
1880 */
1881 LLVMValueRef have_exports =
1882 LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, "");
1883 num_emit_threads =
1884 LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, "");
1885
1886 /* Allocate export space. Send this message as early as possible, to
1887 * hide the latency of the SQ <-> SPI roundtrip.
1888 *
1889 * Note: We could consider compacting primitives for export as well.
1890 * PA processes 1 non-null prim / clock, but it fetches 4 DW of
1891 * prim data per clock and skips null primitives at no additional
1892 * cost. So compacting primitives can only be beneficial when
1893 * there are 4 or more contiguous null primitives in the export
1894 * (in the common case of single-dword prim exports).
1895 */
1896 ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx),
1897 vertlive_scan.result_reduce, num_emit_threads);
1898
1899 /* Set up the reverse vertex compaction permutation. We reuse the stream-1
1900 * slot of the primitive flags, relying on the fact that each threadgroup
1901 * has at most 256 threads, so the thread index fits in 8 bits. */
1902 ac_build_ifcc(&ctx->ac, vertlive, 5130);
1903 {
1904 tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive);
1905 tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, "");
1906 LLVMBuildStore(builder, tmp2, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1));
1907 }
1908 ac_build_endif(&ctx->ac, 5130);
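/* Illustrative example: if only vertices 0, 2 and 5 are live, they store
 * their original thread ids at compacted slots 0, 1 and 2; the vertex
 * export block below then reads slot tid to recover which original vertex
 * it must output.
 */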
1909
1910 ac_build_s_barrier(&ctx->ac);
1911
1912 /* Export primitive data */
1913 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
1914 ac_build_ifcc(&ctx->ac, tmp, 5140);
1915 {
1916 LLVMValueRef flags;
1917 struct ac_ngg_prim prim = {};
1918 prim.num_vertices = verts_per_prim;
1919
1920 tmp = ngg_gs_vertex_ptr(ctx, tid);
1921 flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
1922 prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), "");
1923
1924 for (unsigned i = 0; i < verts_per_prim; ++i) {
1925 prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive,
1926 LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
1927 prim.edgeflag[i] = ctx->ac.i1false;
1928 }
1929
1930 /* Geometry shaders output triangle strips, but NGG expects triangles. */
1931 if (verts_per_prim == 3) {
1932 LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, "");
1933 is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, "");
1934 LLVMValueRef flatshade_first =
1935 LLVMBuildICmp(builder, LLVMIntEQ,
1936 si_unpack_param(ctx, ctx->vs_state_bits, 4, 2),
1937 ctx->ac.i32_0, "");
1938
1939 ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd,
1940 flatshade_first,
1941 prim.index);
1942 }
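/* Roughly: odd strip triangles get two of their indices swapped so the
 * winding matches a triangle list; which pair is swapped depends on
 * flatshade_first so that the provoking vertex stays in place.
 */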
1943
1944 ac_build_export_prim(&ctx->ac, &prim);
1945 }
1946 ac_build_endif(&ctx->ac, 5140);
1947
1948 /* Export position and parameter data */
1949 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, "");
1950 ac_build_ifcc(&ctx->ac, tmp, 5145);
1951 {
1952 struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];
1953
1954 tmp = ngg_gs_vertex_ptr(ctx, tid);
1955 tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), "");
1956 tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
1957 const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp);
1958
1959 unsigned out_idx = 0;
1960 for (unsigned i = 0; i < info->num_outputs; i++) {
1961 outputs[i].semantic_name = info->output_semantic_name[i];
1962 outputs[i].semantic_index = info->output_semantic_index[i];
1963
1964 for (unsigned j = 0; j < 4; j++, out_idx++) {
1965 tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx);
1966 tmp = LLVMBuildLoad(builder, tmp, "");
1967 outputs[i].values[j] = ac_to_float(&ctx->ac, tmp);
1968 outputs[i].vertex_stream[j] =
1969 (info->output_streams[i] >> (2 * j)) & 3;
1970 }
1971 }
1972
1973 si_llvm_build_vs_exports(ctx, outputs, info->num_outputs);
1974 }
1975 ac_build_endif(&ctx->ac, 5145);
1976 }
1977
1978 static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts,
1979 unsigned min_verts_per_prim, bool use_adjacency)
1980 {
1981 unsigned max_reuse = max_esverts - min_verts_per_prim;
1982 if (use_adjacency)
1983 max_reuse /= 2;
1984 *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
1985 }
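/* Worked example (illustrative): with max_esverts == 128,
 * min_verts_per_prim == 3 and no adjacency, max_reuse == 125 and
 * max_gsprims is clamped to 126: the first triangle consumes 3 new
 * vertices and each following one needs at least 1 new vertex. With
 * adjacency, each new primitive needs at least 2 new vertices, hence the
 * halving.
 */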
1986
1987 /**
1988 * Determine subgroup information such as the maximum number of vertices and primitives.
1989 *
1990 * This happens before the shader is uploaded, since LDS relocations during
1991 * upload depend on the subgroup size.
1992 */
1993 void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
1994 {
1995 const struct si_shader_selector *gs_sel = shader->selector;
1996 const struct si_shader_selector *es_sel =
1997 shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel;
1998 const enum pipe_shader_type gs_type = gs_sel->type;
1999 const unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1);
2000 const unsigned input_prim = si_get_input_prim(gs_sel);
2001 const bool use_adjacency = input_prim >= PIPE_PRIM_LINES_ADJACENCY &&
2002 input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;
2003 const unsigned max_verts_per_prim = u_vertices_per_prim(input_prim);
2004 const unsigned min_verts_per_prim =
2005 gs_type == PIPE_SHADER_GEOMETRY ? max_verts_per_prim : 1;
2006
2007 /* All these are in dwords: */
2008 /* We can't allow using the whole LDS, because GS waves compete with
2009 * other shader stages for LDS space.
2010 *
2011 * TODO: We should really take the shader's internal LDS use into
2012 * account. The linker will fail if the size is greater than
2013 * 8K dwords.
2014 */
2015 const unsigned max_lds_size = 8 * 1024 - 768;
2016 const unsigned target_lds_size = max_lds_size;
2017 unsigned esvert_lds_size = 0;
2018 unsigned gsprim_lds_size = 0;
2019
2020 /* All these are per subgroup: */
2021 bool max_vert_out_per_gs_instance = false;
2022 unsigned max_gsprims_base = 128; /* default prim group size clamp */
2023 unsigned max_esverts_base = 128;
2024
2025 if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
2026 max_gsprims_base = 128 / 3;
2027 max_esverts_base = max_gsprims_base * 3;
2028 } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
2029 max_gsprims_base = 126;
2030 max_esverts_base = 128;
2031 }
2032
2033 /* Hardware has the following non-natural restrictions on the value
2034 * of GE_CNTL.VERT_GRP_SIZE based on the primitive type of
2035 * the draw:
2036 * - at most 252 for any line input primitive type
2037 * - at most 251 for any quad input primitive type
2038 * - at most 251 for triangle strips with adjacency (this happens to
2039 * be the natural limit for triangle *lists* with adjacency)
2040 */
2041 max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
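/* Illustration: hw_max_esverts is computed at the end of this function as
 * max_esverts - max_verts_per_prim + 1, so this clamp caps it at 251 for
 * every primitive type (conservative for lines, which would allow 252).
 */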
2042
2043 if (gs_type == PIPE_SHADER_GEOMETRY) {
2044 unsigned max_out_verts_per_gsprim =
2045 gs_sel->gs_max_out_vertices * gs_num_invocations;
2046
2047 if (max_out_verts_per_gsprim <= 256) {
2048 if (max_out_verts_per_gsprim) {
2049 max_gsprims_base = MIN2(max_gsprims_base,
2050 256 / max_out_verts_per_gsprim);
2051 }
2052 } else {
2053 /* Use special multi-cycling mode in which each GS
2054 * instance gets its own subgroup. Does not work with
2055 * tessellation. */
2056 max_vert_out_per_gs_instance = true;
2057 max_gsprims_base = 1;
2058 max_out_verts_per_gsprim = gs_sel->gs_max_out_vertices;
2059 }
2060
2061 esvert_lds_size = es_sel->esgs_itemsize / 4;
2062 gsprim_lds_size = (gs_sel->gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
2063 } else {
2064 /* VS and TES. */
2065 /* LDS size for passing data from ES to GS. */
2066 esvert_lds_size = ngg_nogs_vertex_size(shader);
2067 }
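/* Size example (illustrative): a GS whose emitted vertex carries two vec4
 * outputs has gsvs_vertex_size == 32 bytes, so gsprim_lds_size counts
 * 8 + 1 == 9 dwords per emitted vertex -- the extra dword being the
 * [4 x i8] primflag slot of the LDS struct in ngg_gs_get_vertex_storage().
 */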
2068
2069 unsigned max_gsprims = max_gsprims_base;
2070 unsigned max_esverts = max_esverts_base;
2071
2072 if (esvert_lds_size)
2073 max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
2074 if (gsprim_lds_size)
2075 max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
2076
2077 max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2078 clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
2079 assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2080
2081 if (esvert_lds_size || gsprim_lds_size) {
2082 /* Now that we have a rough proportionality between esverts
2083 * and gsprims based on the primitive type, scale both of them
2084 * down simultaneously based on required LDS space.
2085 *
2086 * We could be smarter about this if we knew how much vertex
2087 * reuse to expect.
2088 */
2089 unsigned lds_total = max_esverts * esvert_lds_size +
2090 max_gsprims * gsprim_lds_size;
2091 if (lds_total > target_lds_size) {
2092 max_esverts = max_esverts * target_lds_size / lds_total;
2093 max_gsprims = max_gsprims * target_lds_size / lds_total;
2094
2095 max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2096 clamp_gsprims_to_esverts(&max_gsprims, max_esverts,
2097 min_verts_per_prim, use_adjacency);
2098 assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2099 }
2100 }
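/* Made-up numbers to illustrate the scaling above: with max_esverts == 128,
 * esvert_lds_size == 40, max_gsprims == 32 and gsprim_lds_size == 90,
 * lds_total == 8000 > 7424, so the limits scale to 128 * 7424 / 8000 == 118
 * and 32 * 7424 / 8000 == 29 before being re-clamped against each other.
 */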
2101
2102 /* Round up towards full wave sizes for better ALU utilization. */
2103 if (!max_vert_out_per_gs_instance) {
2104 const unsigned wavesize = gs_sel->screen->ge_wave_size;
2105 unsigned orig_max_esverts;
2106 unsigned orig_max_gsprims;
2107 do {
2108 orig_max_esverts = max_esverts;
2109 orig_max_gsprims = max_gsprims;
2110
2111 max_esverts = align(max_esverts, wavesize);
2112 max_esverts = MIN2(max_esverts, max_esverts_base);
2113 if (esvert_lds_size)
2114 max_esverts = MIN2(max_esverts,
2115 (max_lds_size - max_gsprims * gsprim_lds_size) /
2116 esvert_lds_size);
2117 max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2118
2119 max_gsprims = align(max_gsprims, wavesize);
2120 max_gsprims = MIN2(max_gsprims, max_gsprims_base);
2121 if (gsprim_lds_size)
2122 max_gsprims = MIN2(max_gsprims,
2123 (max_lds_size - max_esverts * esvert_lds_size) /
2124 gsprim_lds_size);
2125 clamp_gsprims_to_esverts(&max_gsprims, max_esverts,
2126 min_verts_per_prim, use_adjacency);
2127 assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2128 } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
2129 }
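/* The loop above iterates to a fixed point: rounding one limit up to a
 * wave multiple shrinks the LDS budget left for the other, which may pull
 * that one down and change the alignment again; iteration stops once
 * neither value moves.
 */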
2130
2131 /* Hardware restriction: minimum value of max_esverts */
2132 max_esverts = MAX2(max_esverts, 23 + max_verts_per_prim);
2133
2134 unsigned max_out_vertices =
2135 max_vert_out_per_gs_instance ? gs_sel->gs_max_out_vertices :
2136 gs_type == PIPE_SHADER_GEOMETRY ?
2137 max_gsprims * gs_num_invocations * gs_sel->gs_max_out_vertices :
2138 max_esverts;
2139 assert(max_out_vertices <= 256);
2140
2141 unsigned prim_amp_factor = 1;
2142 if (gs_type == PIPE_SHADER_GEOMETRY) {
2143 /* Number of output primitives per GS input primitive after
2144 * GS instancing. */
2145 prim_amp_factor = gs_sel->gs_max_out_vertices;
2146 }
2147
2148 /* The GE only checks against the maximum number of ES verts after
2149 * allocating a full GS primitive. So we need to ensure that whenever
2150 * this check passes, there is enough space for a full primitive without
2151 * vertex reuse.
2152 */
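/* E.g. for triangles with max_esverts == 128 this yields 126: a subgroup
 * that still passes the check (fewer than 126 vertices in use) can accept
 * one more triangle's up to 3 new vertices without exceeding 128.
 */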
2153 shader->ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1;
2154 shader->ngg.max_gsprims = max_gsprims;
2155 shader->ngg.max_out_verts = max_out_vertices;
2156 shader->ngg.prim_amp_factor = prim_amp_factor;
2157 shader->ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
2158
2159 shader->gs_info.esgs_ring_size = 4 * max_esverts * esvert_lds_size;
2160 shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size;
2161
2162 assert(shader->ngg.hw_max_esverts >= 24); /* HW limitation */
2163 }