src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
/*
 * Copyright 2017 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "si_shader_internal.h"

#include "sid.h"

#include "util/u_memory.h"
#include "util/u_prim.h"
#include "ac_llvm_cull.h"

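/* The merged_wave_info SGPR packs, among other fields, the wave index within
 * the threadgroup in bits [24:28) and the threadgroup wave count in bits
 * [28:32), as implied by the si_unpack_param offsets below.
 */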
static LLVMValueRef get_wave_id_in_tg(struct si_shader_context *ctx)
{
   return si_unpack_param(ctx, ctx->merged_wave_info, 24, 4);
}

static LLVMValueRef get_tgsize(struct si_shader_context *ctx)
{
   return si_unpack_param(ctx, ctx->merged_wave_info, 28, 4);
}

static LLVMValueRef get_thread_id_in_tg(struct si_shader_context *ctx)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef tmp;
   tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
                      LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), "");
   return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
}

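/* The gs_tg_info SGPR packs the ordered ID in bits [0:12), the threadgroup
 * vertex count in bits [12:21), and the threadgroup primitive count in bits
 * [22:31), matching the unpack offsets used below.
 */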
static LLVMValueRef ngg_get_vtx_cnt(struct si_shader_context *ctx)
{
   return si_unpack_param(ctx, ctx->gs_tg_info, 12, 9);
}

static LLVMValueRef ngg_get_prim_cnt(struct si_shader_context *ctx)
{
   return si_unpack_param(ctx, ctx->gs_tg_info, 22, 9);
}

static LLVMValueRef ngg_get_ordered_id(struct si_shader_context *ctx)
{
   return si_unpack_param(ctx, ctx->gs_tg_info, 0, 12);
}

static LLVMValueRef ngg_get_query_buf(struct si_shader_context *ctx)
{
   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);

   return ac_build_load_to_sgpr(&ctx->ac, buf_ptr,
                                LLVMConstInt(ctx->i32, GFX10_GS_QUERY_BUF, false));
}

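/* For NGG VS, the initial (fixed-function) edge flags of the primitive arrive
 * packed in the gs_invocation_id input VGPR, one bit per vertex starting at
 * bit 8; TES has no API edge flags, so it always returns false here.
 */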
static LLVMValueRef ngg_get_initial_edgeflag(struct si_shader_context *ctx, unsigned index)
{
   if (ctx->type == PIPE_SHADER_VERTEX) {
      LLVMValueRef tmp;
      tmp = LLVMBuildLShr(ctx->ac.builder,
                          ac_get_arg(&ctx->ac, ctx->args.gs_invocation_id),
                          LLVMConstInt(ctx->ac.i32, 8 + index, false), "");
      return LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i1, "");
   }
   return ctx->i1false;
}

/**
 * Return the number of vertices as a constant in \p num_vertices,
 * and return a more precise value as LLVMValueRef from the function.
 */
static LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx,
                                              unsigned *num_vertices)
{
   const struct si_shader_info *info = &ctx->shader->selector->info;

   if (ctx->type == PIPE_SHADER_VERTEX) {
      if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
         /* Blits always use axis-aligned rectangles with 3 vertices. */
         *num_vertices = 3;
         return LLVMConstInt(ctx->i32, 3, 0);
      } else {
         /* We always build up all three indices for the prim export
          * independent of the primitive type. The additional garbage
          * data shouldn't hurt. This number doesn't matter with
          * NGG passthrough.
          */
         *num_vertices = 3;

         /* Extract OUTPRIM field. */
         LLVMValueRef num = si_unpack_param(ctx, ctx->vs_state_bits, 2, 2);
         return LLVMBuildAdd(ctx->ac.builder, num, ctx->i32_1, "");
      }
   } else {
      assert(ctx->type == PIPE_SHADER_TESS_EVAL);

      if (info->properties[TGSI_PROPERTY_TES_POINT_MODE])
         *num_vertices = 1;
      else if (info->properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES)
         *num_vertices = 2;
      else
         *num_vertices = 3;

      return LLVMConstInt(ctx->i32, *num_vertices, false);
   }
}

bool gfx10_ngg_export_prim_early(struct si_shader *shader)
{
   struct si_shader_selector *sel = shader->selector;

   assert(shader->key.as_ngg && !shader->key.as_es);

   return sel->type != PIPE_SHADER_GEOMETRY &&
          !sel->info.writes_edgeflag;
}

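/* The GS_ALLOC_REQ message tells the fixed-function hardware how many vertex
 * and primitive export slots this threadgroup needs; ac_build_sendmsg_gs_alloc_req
 * takes the wave ID so that only one wave in the group issues the message.
 */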
void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx)
{
   ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx),
                                 ngg_get_vtx_cnt(ctx),
                                 ngg_get_prim_cnt(ctx));
}

void gfx10_ngg_build_export_prim(struct si_shader_context *ctx,
                                 LLVMValueRef user_edgeflags[3],
                                 LLVMValueRef prim_passthrough)
{
   LLVMBuilderRef builder = ctx->ac.builder;

   if (gfx10_is_ngg_passthrough(ctx->shader) ||
       ctx->shader->key.opt.ngg_culling) {
      ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
      {
         struct ac_ngg_prim prim = {};

         if (prim_passthrough)
            prim.passthrough = prim_passthrough;
         else
            prim.passthrough = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);

         /* This is only used with NGG culling, which returns the NGG
          * passthrough prim export encoding.
          */
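         /* In that encoding each vertex takes 10 bits: a 9-bit vertex index
          * plus an edge flag bit at bits 9, 19 and 29, with bit 31 as the
          * null-primitive flag (see the shifts below and ac_pack_prim_export
          * in the culling path).
          */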
         if (ctx->shader->selector->info.writes_edgeflag) {
            unsigned all_bits_no_edgeflags = ~SI_NGG_PRIM_EDGE_FLAG_BITS;
            LLVMValueRef edgeflags = LLVMConstInt(ctx->i32, all_bits_no_edgeflags, 0);

            unsigned num_vertices;
            ngg_get_vertices_per_prim(ctx, &num_vertices);

            for (unsigned i = 0; i < num_vertices; i++) {
               unsigned shift = 9 + i * 10;
               LLVMValueRef edge;

               edge = LLVMBuildLoad(builder, user_edgeflags[i], "");
               edge = LLVMBuildZExt(builder, edge, ctx->i32, "");
               edge = LLVMBuildShl(builder, edge, LLVMConstInt(ctx->i32, shift, 0), "");
               edgeflags = LLVMBuildOr(builder, edgeflags, edge, "");
            }
            prim.passthrough = LLVMBuildAnd(builder, prim.passthrough, edgeflags, "");
         }

         ac_build_export_prim(&ctx->ac, &prim);
      }
      ac_build_endif(&ctx->ac, 6001);
      return;
   }

   ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
   {
      struct ac_ngg_prim prim = {};

      ngg_get_vertices_per_prim(ctx, &prim.num_vertices);

      prim.isnull = ctx->ac.i1false;
      prim.index[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
      prim.index[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
      prim.index[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);

      for (unsigned i = 0; i < prim.num_vertices; ++i) {
         prim.edgeflag[i] = ngg_get_initial_edgeflag(ctx, i);

         if (ctx->shader->selector->info.writes_edgeflag) {
            LLVMValueRef edge;

            edge = LLVMBuildLoad(ctx->ac.builder, user_edgeflags[i], "");
            edge = LLVMBuildAnd(ctx->ac.builder, prim.edgeflag[i], edge, "");
            prim.edgeflag[i] = edge;
         }
      }

      ac_build_export_prim(&ctx->ac, &prim);
   }
   ac_build_endif(&ctx->ac, 6001);
}

static void build_streamout_vertex(struct si_shader_context *ctx,
                                   LLVMValueRef *so_buffer, LLVMValueRef *wg_offset_dw,
                                   unsigned stream, LLVMValueRef offset_vtx,
                                   LLVMValueRef vertexptr)
{
   struct si_shader_info *info = &ctx->shader->selector->info;
   struct pipe_stream_output_info *so = &ctx->shader->selector->so;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef offset[4] = {};
   LLVMValueRef tmp;

   for (unsigned buffer = 0; buffer < 4; ++buffer) {
      if (!wg_offset_dw[buffer])
         continue;

      tmp = LLVMBuildMul(builder, offset_vtx,
                         LLVMConstInt(ctx->i32, so->stride[buffer], false), "");
      tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, "");
      offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 2, false), "");
   }

   for (unsigned i = 0; i < so->num_outputs; ++i) {
      if (so->output[i].stream != stream)
         continue;

      unsigned reg = so->output[i].register_index;
      struct si_shader_output_values out;
      out.semantic_name = info->output_semantic_name[reg];
      out.semantic_index = info->output_semantic_index[reg];

      for (unsigned comp = 0; comp < 4; comp++) {
         tmp = ac_build_gep0(&ctx->ac, vertexptr,
                             LLVMConstInt(ctx->i32, 4 * reg + comp, false));
         out.values[comp] = LLVMBuildLoad(builder, tmp, "");
         out.vertex_stream[comp] =
            (info->output_streams[reg] >> (2 * comp)) & 3;
      }

      si_emit_streamout_output(ctx, so_buffer, offset, &so->output[i], &out);
   }
}

struct ngg_streamout {
   LLVMValueRef num_vertices;

   /* per-thread data */
   LLVMValueRef prim_enable[4]; /* i1 per stream */
   LLVMValueRef vertices[3]; /* [N x i32] addrspace(LDS)* */

   /* Output */
   LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */
};

/**
 * Build streamout logic.
 *
 * Implies a barrier.
 *
 * Writes number of emitted primitives to gs_ngg_scratch[4:8].
 *
 * Clobbers gs_ngg_scratch[8:].
 */
static void build_streamout(struct si_shader_context *ctx,
                            struct ngg_streamout *nggso)
{
   struct si_shader_info *info = &ctx->shader->selector->info;
   struct pipe_stream_output_info *so = &ctx->shader->selector->so;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
   LLVMValueRef tid = get_thread_id_in_tg(ctx);
   LLVMValueRef tmp, tmp2;
   LLVMValueRef i32_2 = LLVMConstInt(ctx->i32, 2, false);
   LLVMValueRef i32_4 = LLVMConstInt(ctx->i32, 4, false);
   LLVMValueRef i32_8 = LLVMConstInt(ctx->i32, 8, false);
   LLVMValueRef so_buffer[4] = {};
   unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) +
                               (nggso->vertices[2] ? 1 : 0);
   LLVMValueRef prim_stride_dw[4] = {};
   LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->i32);
   int stream_for_buffer[4] = { -1, -1, -1, -1 };
   unsigned bufmask_for_stream[4] = {};
   bool isgs = ctx->type == PIPE_SHADER_GEOMETRY;
   unsigned scratch_emit_base = isgs ? 4 : 0;
   LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->i32_0;
   unsigned scratch_offset_base = isgs ? 8 : 4;
   LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4;

   ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256);

   /* Determine the mapping of streamout buffers to vertex streams. */
   for (unsigned i = 0; i < so->num_outputs; ++i) {
      unsigned buf = so->output[i].output_buffer;
      unsigned stream = so->output[i].stream;
      assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream);
      stream_for_buffer[buf] = stream;
      bufmask_for_stream[stream] |= 1 << buf;
   }

   for (unsigned buffer = 0; buffer < 4; ++buffer) {
      if (stream_for_buffer[buffer] == -1)
         continue;

      assert(so->stride[buffer]);

      tmp = LLVMConstInt(ctx->i32, so->stride[buffer], false);
      prim_stride_dw[buffer] = LLVMBuildMul(builder, tmp, nggso->num_vertices, "");
      prim_stride_dw_vgpr = ac_build_writelane(
         &ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer],
         LLVMConstInt(ctx->i32, buffer, false));

      so_buffer[buffer] = ac_build_load_to_sgpr(
         &ctx->ac, buf_ptr,
         LLVMConstInt(ctx->i32, SI_VS_STREAMOUT_BUF0 + buffer, false));
   }

   tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->i32_0, "");
   ac_build_ifcc(&ctx->ac, tmp, 5200);
   {
      LLVMTypeRef gdsptr = LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS);
      LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->i32_0, gdsptr, "");

      /* Advance the streamout offsets in GDS. */
      LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->i32, "");
      LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->i32, "");

      tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
      ac_build_ifcc(&ctx->ac, tmp, 5210);
      {
         if (isgs) {
            tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid);
            tmp = LLVMBuildLoad(builder, tmp, "");
         } else {
            tmp = ac_build_writelane(&ctx->ac, ctx->i32_0,
                                     ngg_get_prim_cnt(ctx), ctx->i32_0);
         }
         LLVMBuildStore(builder, tmp, generated_by_stream_vgpr);

         unsigned swizzle[4];
         int unused_stream = -1;
         for (unsigned stream = 0; stream < 4; ++stream) {
            if (!info->num_stream_output_components[stream]) {
               unused_stream = stream;
               break;
            }
         }
         for (unsigned buffer = 0; buffer < 4; ++buffer) {
            if (stream_for_buffer[buffer] >= 0) {
               swizzle[buffer] = stream_for_buffer[buffer];
            } else {
               assert(unused_stream >= 0);
               swizzle[buffer] = unused_stream;
            }
         }

         tmp = ac_build_quad_swizzle(&ctx->ac, tmp,
                                     swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
         tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");

         LLVMValueRef args[] = {
            LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""),
            tmp,
            ctx->i32_0, // ordering
            ctx->i32_0, // scope
            ctx->ac.i1false, // isVolatile
            LLVMConstInt(ctx->i32, 4 << 24, false), // OA index
            ctx->ac.i1true, // wave release
            ctx->ac.i1true, // wave done
         };
         tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add",
                                  ctx->i32, args, ARRAY_SIZE(args), 0);

         /* Keep offsets in a VGPR for quick retrieval via readlane by
          * the first wave for bounds checking, and also store in LDS
          * for retrieval by all waves later. */
         LLVMBuildStore(builder, tmp, offsets_vgpr);

         tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac),
                             scratch_offset_basev, "");
         tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2);
         LLVMBuildStore(builder, tmp, tmp2);
      }
      ac_build_endif(&ctx->ac, 5210);

      /* Determine the max emit per buffer. This is done via the SALU, in part
       * because LLVM can't generate divide-by-multiply if we try to do this
       * via VALU with one lane per buffer.
       */
      LLVMValueRef max_emit[4] = {};
      for (unsigned buffer = 0; buffer < 4; ++buffer) {
         if (stream_for_buffer[buffer] == -1)
            continue;

         LLVMValueRef bufsize_dw =
            LLVMBuildLShr(builder,
                          LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""),
                          i32_2, "");

         tmp = LLVMBuildLoad(builder, offsets_vgpr, "");
         LLVMValueRef offset_dw =
            ac_build_readlane(&ctx->ac, tmp,
                              LLVMConstInt(ctx->i32, buffer, false));

         tmp = LLVMBuildSub(builder, bufsize_dw, offset_dw, "");
         tmp = LLVMBuildUDiv(builder, tmp, prim_stride_dw[buffer], "");

         tmp2 = LLVMBuildICmp(builder, LLVMIntULT, bufsize_dw, offset_dw, "");
         max_emit[buffer] = LLVMBuildSelect(builder, tmp2, ctx->i32_0, tmp, "");
      }

      /* Determine the number of emitted primitives per stream and fixup the
       * GDS counter if necessary.
       *
       * This is complicated by the fact that a single stream can emit to
       * multiple buffers (but luckily not vice versa).
       */
      LLVMValueRef emit_vgpr = ctx->i32_0;

      for (unsigned stream = 0; stream < 4; ++stream) {
         if (!info->num_stream_output_components[stream])
            continue;

         tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, "");
         LLVMValueRef generated =
            ac_build_readlane(&ctx->ac, tmp,
                              LLVMConstInt(ctx->i32, stream, false));

         LLVMValueRef emit = generated;
         for (unsigned buffer = 0; buffer < 4; ++buffer) {
            if (stream_for_buffer[buffer] == stream)
               emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]);
         }

         emit_vgpr = ac_build_writelane(&ctx->ac, emit_vgpr, emit,
                                        LLVMConstInt(ctx->i32, stream, false));

         /* Fixup the offset using a plain GDS atomic if we overflowed. */
         tmp = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, "");
         ac_build_ifcc(&ctx->ac, tmp, 5221); /* scalar branch */
         tmp = LLVMBuildLShr(builder,
                             LLVMConstInt(ctx->i32, bufmask_for_stream[stream], false),
                             ac_get_thread_id(&ctx->ac), "");
         tmp = LLVMBuildTrunc(builder, tmp, ctx->i1, "");
         ac_build_ifcc(&ctx->ac, tmp, 5222);
         {
            tmp = LLVMBuildSub(builder, generated, emit, "");
            tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
            tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, "");
            LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp,
                               LLVMAtomicOrderingMonotonic, false);
         }
         ac_build_endif(&ctx->ac, 5222);
         ac_build_endif(&ctx->ac, 5221);
      }

      tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
      ac_build_ifcc(&ctx->ac, tmp, 5225);
      {
         tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac),
                            scratch_emit_basev, "");
         tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp);
         LLVMBuildStore(builder, emit_vgpr, tmp);
      }
      ac_build_endif(&ctx->ac, 5225);
   }
   ac_build_endif(&ctx->ac, 5200);

   /* Determine the workgroup-relative per-thread / primitive offset into
    * the streamout buffers */
   struct ac_wg_scan primemit_scan[4] = {};

   if (isgs) {
      for (unsigned stream = 0; stream < 4; ++stream) {
         if (!info->num_stream_output_components[stream])
            continue;

         primemit_scan[stream].enable_exclusive = true;
         primemit_scan[stream].op = nir_op_iadd;
         primemit_scan[stream].src = nggso->prim_enable[stream];
         primemit_scan[stream].scratch =
            ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch,
                          LLVMConstInt(ctx->i32, 12 + 8 * stream, false));
         primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx);
         primemit_scan[stream].numwaves = get_tgsize(ctx);
         primemit_scan[stream].maxwaves = 8;
         ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]);
      }
   }

   ac_build_s_barrier(&ctx->ac);

   /* Fetch the per-buffer offsets and per-stream emit counts in all waves. */
   LLVMValueRef wgoffset_dw[4] = {};

   {
      LLVMValueRef scratch_vgpr;

      tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac));
      scratch_vgpr = LLVMBuildLoad(builder, tmp, "");

      for (unsigned buffer = 0; buffer < 4; ++buffer) {
         if (stream_for_buffer[buffer] >= 0) {
            wgoffset_dw[buffer] = ac_build_readlane(
               &ctx->ac, scratch_vgpr,
               LLVMConstInt(ctx->i32, scratch_offset_base + buffer, false));
         }
      }

      for (unsigned stream = 0; stream < 4; ++stream) {
         if (info->num_stream_output_components[stream]) {
            nggso->emit[stream] = ac_build_readlane(
               &ctx->ac, scratch_vgpr,
               LLVMConstInt(ctx->i32, scratch_emit_base + stream, false));
         }
      }
   }

   /* Write out primitive data */
   for (unsigned stream = 0; stream < 4; ++stream) {
      if (!info->num_stream_output_components[stream])
         continue;

      if (isgs) {
         ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]);
      } else {
         primemit_scan[stream].result_exclusive = tid;
      }

      tmp = LLVMBuildICmp(builder, LLVMIntULT,
                          primemit_scan[stream].result_exclusive,
                          nggso->emit[stream], "");
      tmp = LLVMBuildAnd(builder, tmp, nggso->prim_enable[stream], "");
      ac_build_ifcc(&ctx->ac, tmp, 5240);
      {
         LLVMValueRef offset_vtx =
            LLVMBuildMul(builder, primemit_scan[stream].result_exclusive,
                         nggso->num_vertices, "");

         for (unsigned i = 0; i < max_num_vertices; ++i) {
            tmp = LLVMBuildICmp(builder, LLVMIntULT,
                                LLVMConstInt(ctx->i32, i, false),
                                nggso->num_vertices, "");
            ac_build_ifcc(&ctx->ac, tmp, 5241);
            build_streamout_vertex(ctx, so_buffer, wgoffset_dw,
                                   stream, offset_vtx, nggso->vertices[i]);
            ac_build_endif(&ctx->ac, 5241);
            offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->i32_1, "");
         }
      }
      ac_build_endif(&ctx->ac, 5240);
   }
}

/* LDS layout of ES vertex data for NGG culling. */
enum {
   /* Byte 0: Boolean ES thread accepted (unculled) flag, and later the old
    *         ES thread ID. After vertex compaction, compacted ES threads
    *         store the old thread ID here to copy input VGPRs from uncompacted
    *         ES threads.
    * Byte 1: New ES thread ID, loaded by GS to prepare the prim export value.
    * Byte 2: TES rel patch ID
    * Byte 3: Unused
    */
   lds_byte0_accept_flag = 0,
   lds_byte0_old_thread_id = 0,
   lds_byte1_new_thread_id,
   lds_byte2_tes_rel_patch_id,
   lds_byte3_unused,

   lds_packed_data = 0, /* lds_byteN_... */

   lds_pos_x,
   lds_pos_y,
   lds_pos_z,
   lds_pos_w,
   lds_pos_x_div_w,
   lds_pos_y_div_w,
   /* If VS: */
   lds_vertex_id,
   lds_instance_id, /* optional */
   /* If TES: */
   lds_tes_u = lds_vertex_id,
   lds_tes_v = lds_instance_id,
   lds_tes_patch_id, /* optional */
};
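
/* Resulting dword offsets: packed_data = 0, pos.xyzw = 1..4, pos.xy/w = 5..6,
 * then VertexID/InstanceID (VS) or u/v (TES) at 7..8, and the optional TES
 * patch ID at 9. The STATIC_ASSERTs in ngg_nogs_vertex_size() below depend
 * on these values.
 */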

static LLVMValueRef si_build_gep_i8(struct si_shader_context *ctx,
                                    LLVMValueRef ptr, unsigned byte_index)
{
   assert(byte_index < 4);
   LLVMTypeRef pi8 = LLVMPointerType(ctx->i8, AC_ADDR_SPACE_LDS);
   LLVMValueRef index = LLVMConstInt(ctx->i32, byte_index, 0);

   return LLVMBuildGEP(ctx->ac.builder,
                       LLVMBuildPointerCast(ctx->ac.builder, ptr, pi8, ""),
                       &index, 1, "");
}

static unsigned ngg_nogs_vertex_size(struct si_shader *shader)
{
   unsigned lds_vertex_size = 0;

   /* The edgeflag is always stored in the last element that's also
    * used for padding to reduce LDS bank conflicts. */
   if (shader->selector->so.num_outputs)
      lds_vertex_size = 4 * shader->selector->info.num_outputs + 1;
   if (shader->selector->info.writes_edgeflag)
      lds_vertex_size = MAX2(lds_vertex_size, 1);

   /* LDS size for passing data from GS to ES.
    * GS stores Primitive IDs into LDS at the address corresponding
    * to the ES thread of the provoking vertex. All ES threads
    * load and export PrimitiveID for their thread.
    */
   if (shader->selector->type == PIPE_SHADER_VERTEX &&
       shader->key.mono.u.vs_export_prim_id)
      lds_vertex_size = MAX2(lds_vertex_size, 1);

   if (shader->key.opt.ngg_culling) {
      if (shader->selector->type == PIPE_SHADER_VERTEX) {
         STATIC_ASSERT(lds_instance_id + 1 == 9);
         lds_vertex_size = MAX2(lds_vertex_size, 9);
      } else {
         assert(shader->selector->type == PIPE_SHADER_TESS_EVAL);

         if (shader->selector->info.uses_primid ||
             shader->key.mono.u.vs_export_prim_id) {
            STATIC_ASSERT(lds_tes_patch_id + 2 == 11);
            lds_vertex_size = MAX2(lds_vertex_size, 11);
         } else {
            STATIC_ASSERT(lds_tes_v + 1 == 9);
            lds_vertex_size = MAX2(lds_vertex_size, 9);
         }
      }
   }

   return lds_vertex_size;
}

/**
 * Returns an `[N x i32] addrspace(LDS)*` pointing at contiguous LDS storage
 * for the vertex outputs.
 */
static LLVMValueRef ngg_nogs_vertex_ptr(struct si_shader_context *ctx,
                                        LLVMValueRef vtxid)
{
   /* The extra dword is used to avoid LDS bank conflicts. */
   unsigned vertex_size = ngg_nogs_vertex_size(ctx->shader);
   LLVMTypeRef ai32 = LLVMArrayType(ctx->i32, vertex_size);
   LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS);
   LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, "");
   return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, "");
}

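/* The two 64-bit masks loaded here cover up to 128 threads, i.e. the maximum
 * of four Wave32 waves used by the NGG culling scheme.
 */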
static void load_bitmasks_2x64(struct si_shader_context *ctx,
                               LLVMValueRef lds_ptr, unsigned dw_offset,
                               LLVMValueRef mask[2], LLVMValueRef *total_bitcount)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef ptr64 = LLVMBuildPointerCast(builder, lds_ptr,
                                             LLVMPointerType(LLVMArrayType(ctx->i64, 2),
                                                             AC_ADDR_SPACE_LDS), "");
   for (unsigned i = 0; i < 2; i++) {
      LLVMValueRef index = LLVMConstInt(ctx->i32, dw_offset / 2 + i, 0);
      mask[i] = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ptr64, index), "");
   }

   /* We get better code if we don't use the 128-bit bitcount. */
   *total_bitcount = LLVMBuildAdd(builder, ac_build_bit_count(&ctx->ac, mask[0]),
                                  ac_build_bit_count(&ctx->ac, mask[1]), "");
}

/**
 * Given a total thread count, update total and per-wave thread counts in input SGPRs
 * and return the per-wave thread count.
 *
 * \param new_num_threads    Total thread count on the input, per-wave thread count on the output.
 * \param tg_info            tg_info SGPR value
 * \param tg_info_num_bits   the bit size of thread count field in tg_info
 * \param tg_info_shift      the bit offset of the thread count field in tg_info
 * \param wave_info          merged_wave_info SGPR value
 * \param wave_info_num_bits the bit size of thread count field in merged_wave_info
 * \param wave_info_shift    the bit offset of the thread count field in merged_wave_info
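 *
 * For example (assuming Wave32): with a new total of 40 threads, wave 0 ends
 * up with min(max(40 - 0*32, 0), 32) = 32 threads, wave 1 with 40 - 32 = 8,
 * and any further wave clamps to 0.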
 */
static void update_thread_counts(struct si_shader_context *ctx,
                                 LLVMValueRef *new_num_threads,
                                 LLVMValueRef *tg_info,
                                 unsigned tg_info_num_bits,
                                 unsigned tg_info_shift,
                                 LLVMValueRef *wave_info,
                                 unsigned wave_info_num_bits,
                                 unsigned wave_info_shift)
{
   LLVMBuilderRef builder = ctx->ac.builder;

   /* Update the total thread count. */
   unsigned tg_info_mask = ~(u_bit_consecutive(0, tg_info_num_bits) << tg_info_shift);
   *tg_info = LLVMBuildAnd(builder, *tg_info,
                           LLVMConstInt(ctx->i32, tg_info_mask, 0), "");
   *tg_info = LLVMBuildOr(builder, *tg_info,
                          LLVMBuildShl(builder, *new_num_threads,
                                       LLVMConstInt(ctx->i32, tg_info_shift, 0), ""), "");

   /* Update the per-wave thread count. */
   LLVMValueRef prev_threads = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
                                            LLVMConstInt(ctx->i32, ctx->ac.wave_size, 0), "");
   *new_num_threads = LLVMBuildSub(builder, *new_num_threads, prev_threads, "");
   *new_num_threads = ac_build_imax(&ctx->ac, *new_num_threads, ctx->i32_0);
   *new_num_threads = ac_build_imin(&ctx->ac, *new_num_threads,
                                    LLVMConstInt(ctx->i32, ctx->ac.wave_size, 0));
   unsigned wave_info_mask = ~(u_bit_consecutive(0, wave_info_num_bits) << wave_info_shift);
   *wave_info = LLVMBuildAnd(builder, *wave_info,
                             LLVMConstInt(ctx->i32, wave_info_mask, 0), "");
   *wave_info = LLVMBuildOr(builder, *wave_info,
                            LLVMBuildShl(builder, *new_num_threads,
                                         LLVMConstInt(ctx->i32, wave_info_shift, 0), ""), "");
}


/**
 * Cull primitives for NGG VS or TES, then compact vertices, which happens
 * before the VS or TES main function. Return values for the main function.
 * Also return the position, which is passed to the shader as an input,
 * so that we don't compute it twice.
 */
void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi,
                                               unsigned max_outputs,
                                               LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader *shader = ctx->shader;
   struct si_shader_selector *sel = shader->selector;
   struct si_shader_info *info = &sel->info;
   LLVMBuilderRef builder = ctx->ac.builder;

   assert(shader->key.opt.ngg_culling);
   assert(shader->key.as_ngg);
   assert(sel->type == PIPE_SHADER_VERTEX ||
          (sel->type == PIPE_SHADER_TESS_EVAL && !shader->key.as_es));

   LLVMValueRef position[4] = {};
   for (unsigned i = 0; i < info->num_outputs; i++) {
      switch (info->output_semantic_name[i]) {
      case TGSI_SEMANTIC_POSITION:
         for (unsigned j = 0; j < 4; j++) {
            position[j] = LLVMBuildLoad(ctx->ac.builder,
                                        addrs[4 * i + j], "");
         }
         break;
      }
   }
   assert(position[0]);

   /* Store Position.XYZW into LDS. */
   LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
   for (unsigned chan = 0; chan < 4; chan++) {
      LLVMBuildStore(builder, ac_to_integer(&ctx->ac, position[chan]),
                     ac_build_gep0(&ctx->ac, es_vtxptr,
                                   LLVMConstInt(ctx->i32, lds_pos_x + chan, 0)));
   }
   /* Store Position.XY / W into LDS. */
   for (unsigned chan = 0; chan < 2; chan++) {
      LLVMValueRef val = ac_build_fdiv(&ctx->ac, position[chan], position[3]);
      LLVMBuildStore(builder, ac_to_integer(&ctx->ac, val),
                     ac_build_gep0(&ctx->ac, es_vtxptr,
                                   LLVMConstInt(ctx->i32, lds_pos_x_div_w + chan, 0)));
   }

   /* Store VertexID and InstanceID. ES threads will have to load them
    * from LDS after vertex compaction and use them instead of their own
    * system values.
    */
   bool uses_instance_id = false;
   bool uses_tes_prim_id = false;
   LLVMValueRef packed_data = ctx->i32_0;

   if (ctx->type == PIPE_SHADER_VERTEX) {
      uses_instance_id = sel->info.uses_instanceid ||
                         shader->key.part.vs.prolog.instance_divisor_is_one ||
                         shader->key.part.vs.prolog.instance_divisor_is_fetched;

      LLVMBuildStore(builder, ctx->abi.vertex_id,
                     ac_build_gep0(&ctx->ac, es_vtxptr,
                                   LLVMConstInt(ctx->i32, lds_vertex_id, 0)));
      if (uses_instance_id) {
         LLVMBuildStore(builder, ctx->abi.instance_id,
                        ac_build_gep0(&ctx->ac, es_vtxptr,
                                      LLVMConstInt(ctx->i32, lds_instance_id, 0)));
      }
   } else {
      uses_tes_prim_id = sel->info.uses_primid ||
                         shader->key.mono.u.vs_export_prim_id;

      assert(ctx->type == PIPE_SHADER_TESS_EVAL);
      LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_u)),
                     ac_build_gep0(&ctx->ac, es_vtxptr,
                                   LLVMConstInt(ctx->i32, lds_tes_u, 0)));
      LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_v)),
                     ac_build_gep0(&ctx->ac, es_vtxptr,
                                   LLVMConstInt(ctx->i32, lds_tes_v, 0)));
      packed_data = LLVMBuildShl(builder, ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id),
                                 LLVMConstInt(ctx->i32, lds_byte2_tes_rel_patch_id * 8, 0), "");
      if (uses_tes_prim_id) {
         LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args.tes_patch_id),
                        ac_build_gep0(&ctx->ac, es_vtxptr,
                                      LLVMConstInt(ctx->i32, lds_tes_patch_id, 0)));
      }
   }
   /* Initialize the packed data. */
   LLVMBuildStore(builder, packed_data,
                  ac_build_gep0(&ctx->ac, es_vtxptr,
                                LLVMConstInt(ctx->i32, lds_packed_data, 0)));
   ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);

   LLVMValueRef tid = ac_get_thread_id(&ctx->ac);

   /* Initialize the last 3 gs_ngg_scratch dwords to 0, because we may have
    * fewer than 4 waves, but we always read all 4 values. This is where the
    * thread bitmasks of unculled threads will be stored.
    *
    * gs_ngg_scratch layout: esmask[0..3]
    */
   ac_build_ifcc(&ctx->ac,
                 LLVMBuildICmp(builder, LLVMIntULT, get_thread_id_in_tg(ctx),
                               LLVMConstInt(ctx->i32, 3, 0), ""), 16101);
   {
      LLVMValueRef index = LLVMBuildAdd(builder, tid, ctx->i32_1, "");
      LLVMBuildStore(builder, ctx->i32_0,
                     ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, index));
   }
   ac_build_endif(&ctx->ac, 16101);
   ac_build_s_barrier(&ctx->ac);

   /* The hardware requires that there are no holes between unculled vertices,
    * which means we have to pack ES threads, i.e. reduce the ES thread count
    * and move ES input VGPRs to lower threads. The upside is that varyings
    * are only fetched and computed for unculled vertices.
    *
    * Vertex compaction in GS threads:
    *
    * Part 1: Compute the surviving vertex mask in GS threads:
    * - Compute 4 32-bit surviving vertex masks in LDS. (max 4 waves)
    * - In GS, notify ES threads whether the vertex survived.
    * - Barrier
    * - ES threads will create the mask and store it in LDS.
    * - Barrier
    * - Each GS thread loads the vertex masks from LDS.
    *
    * Part 2: Compact ES threads in GS threads:
    * - Compute the prefix sum for all 3 vertices from the masks. These are the new
    *   thread IDs for each vertex within the primitive.
    * - Write the value of the old thread ID into the LDS address of the new thread ID.
    *   The ES thread will load the old thread ID and use it to load the position, VertexID,
    *   and InstanceID.
    * - Update vertex indices and null flag in the GS input VGPRs.
    * - Barrier
    *
    * Part 3: Update input GPRs
    * - For all waves, update per-wave thread counts in input SGPRs.
    * - In ES threads, update the ES input VGPRs (VertexID, InstanceID, TES inputs).
    */

   LLVMValueRef vtxindex[] = {
      si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16),
      si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16),
      si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16),
   };
   LLVMValueRef gs_vtxptr[] = {
      ngg_nogs_vertex_ptr(ctx, vtxindex[0]),
      ngg_nogs_vertex_ptr(ctx, vtxindex[1]),
      ngg_nogs_vertex_ptr(ctx, vtxindex[2]),
   };
   es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));

   LLVMValueRef gs_accepted = ac_build_alloca(&ctx->ac, ctx->i32, "");

   /* Do culling in GS threads. */
   ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 16002);
   {
      /* Load positions. */
      LLVMValueRef pos[3][4] = {};
      for (unsigned vtx = 0; vtx < 3; vtx++) {
         for (unsigned chan = 0; chan < 4; chan++) {
            unsigned index;
            if (chan == 0 || chan == 1)
               index = lds_pos_x_div_w + chan;
            else if (chan == 3)
               index = lds_pos_w;
            else
               continue;

            LLVMValueRef addr = ac_build_gep0(&ctx->ac, gs_vtxptr[vtx],
                                              LLVMConstInt(ctx->i32, index, 0));
            pos[vtx][chan] = LLVMBuildLoad(builder, addr, "");
            pos[vtx][chan] = ac_to_float(&ctx->ac, pos[vtx][chan]);
         }
      }

      /* Load the viewport state for small prim culling. */
      LLVMValueRef vp = ac_build_load_invariant(&ctx->ac,
                                                ac_get_arg(&ctx->ac, ctx->small_prim_cull_info),
                                                ctx->i32_0);
      vp = LLVMBuildBitCast(builder, vp, ctx->v4f32, "");
      LLVMValueRef vp_scale[2], vp_translate[2];
      vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
      vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
      vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
      vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);

      /* Get the small prim filter precision. */
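      /* The 4-bit value from vs_state_bits is a biased exponent: OR'ing it
       * with 0x70 and shifting it into the float exponent field (bit 23)
       * constructs the power-of-two precision 2^(n - 15) directly, without
       * any float arithmetic.
       */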
      LLVMValueRef small_prim_precision = si_unpack_param(ctx, ctx->vs_state_bits, 7, 4);
      small_prim_precision = LLVMBuildOr(builder, small_prim_precision,
                                         LLVMConstInt(ctx->i32, 0x70, 0), "");
      small_prim_precision = LLVMBuildShl(builder, small_prim_precision,
                                          LLVMConstInt(ctx->i32, 23, 0), "");
      small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->f32, "");

      /* Execute culling code. */
      struct ac_cull_options options = {};
      options.cull_front = shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE;
      options.cull_back = shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE;
      options.cull_view_xy = shader->key.opt.ngg_culling & SI_NGG_CULL_VIEW_SMALLPRIMS;
      options.cull_small_prims = options.cull_view_xy;
      options.cull_zero_area = options.cull_front || options.cull_back;
      options.cull_w = true;

      /* Tell ES threads whether their vertex survived. */
      ac_build_ifcc(&ctx->ac, ac_cull_triangle(&ctx->ac, pos, ctx->i1true,
                                               vp_scale, vp_translate,
                                               small_prim_precision, &options), 16003);
      {
         LLVMBuildStore(builder, ctx->ac.i32_1, gs_accepted);
         for (unsigned vtx = 0; vtx < 3; vtx++) {
            LLVMBuildStore(builder, ctx->ac.i8_1,
                           si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte0_accept_flag));
         }
      }
      ac_build_endif(&ctx->ac, 16003);
   }
   ac_build_endif(&ctx->ac, 16002);
   ac_build_s_barrier(&ctx->ac);

   gs_accepted = LLVMBuildLoad(builder, gs_accepted, "");

   LLVMValueRef es_accepted = ac_build_alloca(&ctx->ac, ctx->i1, "");

   /* Convert the per-vertex flag to a thread bitmask in ES threads and store it in LDS. */
   ac_build_ifcc(&ctx->ac, si_is_es_thread(ctx), 16007);
   {
      LLVMValueRef es_accepted_flag =
         LLVMBuildLoad(builder,
                       si_build_gep_i8(ctx, es_vtxptr, lds_byte0_accept_flag), "");

      LLVMValueRef es_accepted_bool = LLVMBuildICmp(builder, LLVMIntNE,
                                                    es_accepted_flag, ctx->ac.i8_0, "");
      LLVMValueRef es_mask = ac_get_i1_sgpr_mask(&ctx->ac, es_accepted_bool);

      LLVMBuildStore(builder, es_accepted_bool, es_accepted);

      ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntEQ,
                                            tid, ctx->i32_0, ""), 16008);
      {
         LLVMBuildStore(builder, es_mask,
                        ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch,
                                      get_wave_id_in_tg(ctx)));
      }
      ac_build_endif(&ctx->ac, 16008);
   }
   ac_build_endif(&ctx->ac, 16007);
   ac_build_s_barrier(&ctx->ac);

   /* Load the vertex masks and compute the new ES thread count. */
   LLVMValueRef es_mask[2], new_num_es_threads, kill_wave;
   load_bitmasks_2x64(ctx, ctx->gs_ngg_scratch, 0, es_mask, &new_num_es_threads);
   new_num_es_threads = ac_build_readlane_no_opt_barrier(&ctx->ac, new_num_es_threads, NULL);

   /* ES threads compute their prefix sum, which is the new ES thread ID.
    * Then they write the value of the old thread ID into the LDS address
    * of the new thread ID. It will be used to load input VGPRs from
    * the old thread's LDS location.
    */
   ac_build_ifcc(&ctx->ac, LLVMBuildLoad(builder, es_accepted, ""), 16009);
   {
      LLVMValueRef old_id = get_thread_id_in_tg(ctx);
      LLVMValueRef new_id = ac_prefix_bitcount_2x64(&ctx->ac, es_mask, old_id);

      LLVMBuildStore(builder, LLVMBuildTrunc(builder, old_id, ctx->i8, ""),
                     si_build_gep_i8(ctx, ngg_nogs_vertex_ptr(ctx, new_id),
                                     lds_byte0_old_thread_id));
      LLVMBuildStore(builder, LLVMBuildTrunc(builder, new_id, ctx->i8, ""),
                     si_build_gep_i8(ctx, es_vtxptr, lds_byte1_new_thread_id));
   }
   ac_build_endif(&ctx->ac, 16009);

   /* Kill waves that have inactive threads. */
   kill_wave = LLVMBuildICmp(builder, LLVMIntULE,
                             ac_build_imax(&ctx->ac, new_num_es_threads, ngg_get_prim_cnt(ctx)),
                             LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
                                          LLVMConstInt(ctx->i32, ctx->ac.wave_size, 0), ""), "");
   ac_build_ifcc(&ctx->ac, kill_wave, 19202);
   {
      /* If we are killing wave 0, send that there are no primitives
       * in this threadgroup.
       */
      ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx),
                                    ctx->i32_0, ctx->i32_0);
      ac_build_s_endpgm(&ctx->ac);
   }
   ac_build_endif(&ctx->ac, 19202);
   ac_build_s_barrier(&ctx->ac);

   /* Send the final vertex and primitive counts. */
   ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx),
                                 new_num_es_threads, ngg_get_prim_cnt(ctx));

   /* Update thread counts in SGPRs. */
   LLVMValueRef new_gs_tg_info = ac_get_arg(&ctx->ac, ctx->gs_tg_info);
   LLVMValueRef new_merged_wave_info = ac_get_arg(&ctx->ac, ctx->merged_wave_info);

   /* This also converts the thread count from the total count to the per-wave count. */
   update_thread_counts(ctx, &new_num_es_threads, &new_gs_tg_info, 9, 12,
                        &new_merged_wave_info, 8, 0);

   /* Update vertex indices in VGPR0 (same format as NGG passthrough). */
   LLVMValueRef new_vgpr0 = ac_build_alloca_undef(&ctx->ac, ctx->i32, "");

   /* Set the null flag at the beginning (culled), and then
    * overwrite it for accepted primitives.
    */
   LLVMBuildStore(builder, LLVMConstInt(ctx->i32, 1u << 31, 0), new_vgpr0);

   /* Get vertex indices after vertex compaction. */
   ac_build_ifcc(&ctx->ac, LLVMBuildTrunc(builder, gs_accepted, ctx->i1, ""), 16011);
   {
      struct ac_ngg_prim prim = {};
      prim.num_vertices = 3;
      prim.isnull = ctx->i1false;

      for (unsigned vtx = 0; vtx < 3; vtx++) {
         prim.index[vtx] =
            LLVMBuildLoad(builder,
                          si_build_gep_i8(ctx, gs_vtxptr[vtx],
                                          lds_byte1_new_thread_id), "");
         prim.index[vtx] = LLVMBuildZExt(builder, prim.index[vtx], ctx->i32, "");
         prim.edgeflag[vtx] = ngg_get_initial_edgeflag(ctx, vtx);
      }

      /* Set the new GS input VGPR. */
      LLVMBuildStore(builder, ac_pack_prim_export(&ctx->ac, &prim), new_vgpr0);
   }
   ac_build_endif(&ctx->ac, 16011);

   if (gfx10_ngg_export_prim_early(shader))
      gfx10_ngg_build_export_prim(ctx, NULL, LLVMBuildLoad(builder, new_vgpr0, ""));

   /* Set the new ES input VGPRs. */
   LLVMValueRef es_data[4];
   LLVMValueRef old_thread_id = ac_build_alloca_undef(&ctx->ac, ctx->i32, "");

   for (unsigned i = 0; i < 4; i++)
      es_data[i] = ac_build_alloca_undef(&ctx->ac, ctx->i32, "");

   ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid,
                                         new_num_es_threads, ""), 16012);
   {
      LLVMValueRef old_id, old_es_vtxptr, tmp;

      /* Load ES input VGPRs from the ES thread before compaction. */
      old_id = LLVMBuildLoad(builder,
                             si_build_gep_i8(ctx, es_vtxptr, lds_byte0_old_thread_id), "");
      old_id = LLVMBuildZExt(builder, old_id, ctx->i32, "");

      LLVMBuildStore(builder, old_id, old_thread_id);
      old_es_vtxptr = ngg_nogs_vertex_ptr(ctx, old_id);

      for (unsigned i = 0; i < 2; i++) {
         tmp = LLVMBuildLoad(builder,
                             ac_build_gep0(&ctx->ac, old_es_vtxptr,
                                           LLVMConstInt(ctx->i32, lds_vertex_id + i, 0)), "");
         LLVMBuildStore(builder, tmp, es_data[i]);
      }

      if (ctx->type == PIPE_SHADER_TESS_EVAL) {
         tmp = LLVMBuildLoad(builder,
                             si_build_gep_i8(ctx, old_es_vtxptr,
                                             lds_byte2_tes_rel_patch_id), "");
         tmp = LLVMBuildZExt(builder, tmp, ctx->i32, "");
         LLVMBuildStore(builder, tmp, es_data[2]);

         if (uses_tes_prim_id) {
            tmp = LLVMBuildLoad(builder,
                                ac_build_gep0(&ctx->ac, old_es_vtxptr,
                                              LLVMConstInt(ctx->i32, lds_tes_patch_id, 0)), "");
            LLVMBuildStore(builder, tmp, es_data[3]);
         }
      }
   }
   ac_build_endif(&ctx->ac, 16012);

   /* Return values for the main function. */
   LLVMValueRef ret = ctx->return_value;
   LLVMValueRef val;

   ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_gs_tg_info, 2, "");
   ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_merged_wave_info, 3, "");
   if (ctx->type == PIPE_SHADER_TESS_EVAL)
      ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 4);

   ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers,
                             8 + SI_SGPR_RW_BUFFERS);
   ret = si_insert_input_ptr(ctx, ret,
                             ctx->bindless_samplers_and_images,
                             8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
   ret = si_insert_input_ptr(ctx, ret,
                             ctx->const_and_shader_buffers,
                             8 + SI_SGPR_CONST_AND_SHADER_BUFFERS);
   ret = si_insert_input_ptr(ctx, ret,
                             ctx->samplers_and_images,
                             8 + SI_SGPR_SAMPLERS_AND_IMAGES);
   ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits,
                             8 + SI_SGPR_VS_STATE_BITS);

   if (ctx->type == PIPE_SHADER_VERTEX) {
      ret = si_insert_input_ptr(ctx, ret, ctx->args.base_vertex,
                                8 + SI_SGPR_BASE_VERTEX);
      ret = si_insert_input_ptr(ctx, ret, ctx->args.start_instance,
                                8 + SI_SGPR_START_INSTANCE);
      ret = si_insert_input_ptr(ctx, ret, ctx->args.draw_id,
                                8 + SI_SGPR_DRAWID);
      ret = si_insert_input_ptr(ctx, ret, ctx->vertex_buffers,
                                8 + SI_VS_NUM_USER_SGPR);
   } else {
      assert(ctx->type == PIPE_SHADER_TESS_EVAL);
      ret = si_insert_input_ptr(ctx, ret, ctx->tcs_offchip_layout,
                                8 + SI_SGPR_TES_OFFCHIP_LAYOUT);
      ret = si_insert_input_ptr(ctx, ret, ctx->tes_offchip_addr,
                                8 + SI_SGPR_TES_OFFCHIP_ADDR);
   }

   unsigned vgpr;
   if (ctx->type == PIPE_SHADER_VERTEX)
      vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1;
   else
      vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;

   val = LLVMBuildLoad(builder, new_vgpr0, "");
   ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val),
                              vgpr++, "");
   vgpr++; /* gs_vtx23_offset */

   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
   vgpr++; /* gs_vtx45_offset */

   if (ctx->type == PIPE_SHADER_VERTEX) {
      val = LLVMBuildLoad(builder, es_data[0], "");
      ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val),
                                 vgpr++, ""); /* VGPR5 - VertexID */
      vgpr += 2;
      if (uses_instance_id) {
         val = LLVMBuildLoad(builder, es_data[1], "");
         ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val),
                                    vgpr++, ""); /* VGPR8 - InstanceID */
      } else {
         vgpr++;
      }
   } else {
      assert(ctx->type == PIPE_SHADER_TESS_EVAL);
      unsigned num_vgprs = uses_tes_prim_id ? 4 : 3;
      for (unsigned i = 0; i < num_vgprs; i++) {
         val = LLVMBuildLoad(builder, es_data[i], "");
         ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val),
                                    vgpr++, "");
      }
      if (num_vgprs == 3)
         vgpr++;
   }
   /* Return the old thread ID. */
   val = LLVMBuildLoad(builder, old_thread_id, "");
   ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");

   /* These two also use LDS. */
   if (sel->info.writes_edgeflag ||
       (ctx->type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id))
      ac_build_s_barrier(&ctx->ac);

   ctx->return_value = ret;
}

/**
 * Emit the epilogue of an API VS or TES shader compiled as ESGS shader.
 */
void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
                             unsigned max_outputs,
                             LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_selector *sel = ctx->shader->selector;
   struct si_shader_info *info = &sel->info;
   struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef tmp, tmp2;

   assert(!ctx->shader->is_gs_copy_shader);
   assert(info->num_outputs <= max_outputs);

   LLVMValueRef vertex_ptr = NULL;

   if (sel->so.num_outputs || sel->info.writes_edgeflag)
      vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));

   for (unsigned i = 0; i < info->num_outputs; i++) {
      outputs[i].semantic_name = info->output_semantic_name[i];
      outputs[i].semantic_index = info->output_semantic_index[i];

      for (unsigned j = 0; j < 4; j++) {
         outputs[i].vertex_stream[j] =
            (info->output_streams[i] >> (2 * j)) & 3;

         /* TODO: we may store more outputs than streamout needs,
          * but streamout performance isn't that important.
          */
         if (sel->so.num_outputs) {
            tmp = ac_build_gep0(&ctx->ac, vertex_ptr,
                                LLVMConstInt(ctx->i32, 4 * i + j, false));
            tmp2 = LLVMBuildLoad(builder, addrs[4 * i + j], "");
            tmp2 = ac_to_integer(&ctx->ac, tmp2);
            LLVMBuildStore(builder, tmp2, tmp);
         }
      }

      /* Store the edgeflag at the end (if streamout is enabled) */
      if (info->output_semantic_name[i] == TGSI_SEMANTIC_EDGEFLAG &&
          sel->info.writes_edgeflag) {
         LLVMValueRef edgeflag = LLVMBuildLoad(builder, addrs[4 * i], "");
         /* The output is a float, but the hw expects a 1-bit integer. */
         edgeflag = LLVMBuildFPToUI(ctx->ac.builder, edgeflag, ctx->i32, "");
         edgeflag = ac_build_umin(&ctx->ac, edgeflag, ctx->i32_1);

         tmp = LLVMConstInt(ctx->i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
         tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp);
         LLVMBuildStore(builder, edgeflag, tmp);
      }
   }

   bool unterminated_es_if_block =
      !sel->so.num_outputs &&
      !sel->info.writes_edgeflag &&
      !ctx->screen->use_ngg_streamout && /* no query buffer */
      (ctx->type != PIPE_SHADER_VERTEX ||
       !ctx->shader->key.mono.u.vs_export_prim_id);

   if (!unterminated_es_if_block)
      ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);

   LLVMValueRef is_gs_thread = si_is_gs_thread(ctx);
   LLVMValueRef is_es_thread = si_is_es_thread(ctx);
   LLVMValueRef vtxindex[3];

   if (ctx->shader->key.opt.ngg_culling) {
      vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 9);
      vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 10, 9);
      vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 20, 9);
   } else {
      vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
      vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
      vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
   }

   /* Determine the number of vertices per primitive. */
   unsigned num_vertices;
   LLVMValueRef num_vertices_val = ngg_get_vertices_per_prim(ctx, &num_vertices);

   /* Streamout */
   LLVMValueRef emitted_prims = NULL;

   if (sel->so.num_outputs) {
      assert(!unterminated_es_if_block);

      struct ngg_streamout nggso = {};
      nggso.num_vertices = num_vertices_val;
      nggso.prim_enable[0] = is_gs_thread;

      for (unsigned i = 0; i < num_vertices; ++i)
         nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);

      build_streamout(ctx, &nggso);
      emitted_prims = nggso.emit[0];
   }

   LLVMValueRef user_edgeflags[3] = {};

   if (sel->info.writes_edgeflag) {
      assert(!unterminated_es_if_block);

      /* Streamout already inserted the barrier, so don't insert it again. */
      if (!sel->so.num_outputs)
         ac_build_s_barrier(&ctx->ac);

      ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
      /* Load edge flags from ES threads and store them into VGPRs in GS threads. */
      for (unsigned i = 0; i < num_vertices; i++) {
         tmp = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
         tmp2 = LLVMConstInt(ctx->i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
         tmp = ac_build_gep0(&ctx->ac, tmp, tmp2);
         tmp = LLVMBuildLoad(builder, tmp, "");
         tmp = LLVMBuildTrunc(builder, tmp, ctx->i1, "");

         user_edgeflags[i] = ac_build_alloca_undef(&ctx->ac, ctx->i1, "");
         LLVMBuildStore(builder, tmp, user_edgeflags[i]);
      }
      ac_build_endif(&ctx->ac, 5400);
   }

   /* Copy Primitive IDs from GS threads to the LDS address corresponding
    * to the ES thread of the provoking vertex.
    */
   if (ctx->type == PIPE_SHADER_VERTEX &&
       ctx->shader->key.mono.u.vs_export_prim_id) {
      assert(!unterminated_es_if_block);

      /* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */
      if (sel->so.num_outputs || sel->info.writes_edgeflag)
         ac_build_s_barrier(&ctx->ac);

      ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
      /* Extract the PROVOKING_VTX_INDEX field. */
      LLVMValueRef provoking_vtx_in_prim =
         si_unpack_param(ctx, ctx->vs_state_bits, 4, 2);

      /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */
      LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3);
      LLVMValueRef provoking_vtx_index =
         LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, "");
      LLVMValueRef vertex_ptr = ngg_nogs_vertex_ptr(ctx, provoking_vtx_index);

      LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args.gs_prim_id),
                     ac_build_gep0(&ctx->ac, vertex_ptr, ctx->i32_0));
      ac_build_endif(&ctx->ac, 5400);
   }

   /* Update query buffer */
   if (ctx->screen->use_ngg_streamout &&
       !info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
      assert(!unterminated_es_if_block);

      tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1);
      tmp = LLVMBuildTrunc(builder, tmp, ctx->i1, "");
      ac_build_ifcc(&ctx->ac, tmp, 5029); /* if (STREAMOUT_QUERY_ENABLED) */
      tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
      ac_build_ifcc(&ctx->ac, tmp, 5030);
      tmp = LLVMBuildICmp(builder, LLVMIntULE, ac_get_thread_id(&ctx->ac),
                          sel->so.num_outputs ? ctx->ac.i32_1 : ctx->ac.i32_0, "");
      ac_build_ifcc(&ctx->ac, tmp, 5031);
      {
         LLVMValueRef args[] = {
            ngg_get_prim_cnt(ctx),
            ngg_get_query_buf(ctx),
            LLVMConstInt(ctx->i32, 16, false), /* offset of stream[0].generated_primitives */
            ctx->i32_0, /* soffset */
            ctx->i32_0, /* cachepolicy */
         };

         if (sel->so.num_outputs) {
            args[0] = ac_build_writelane(&ctx->ac, args[0], emitted_prims, ctx->i32_1);
            args[2] = ac_build_writelane(&ctx->ac, args[2],
                                         LLVMConstInt(ctx->i32, 24, false), ctx->i32_1);
         }

         /* TODO: should this be 64-bit atomics? */
         ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32",
                            ctx->i32, args, 5, 0);
      }
      ac_build_endif(&ctx->ac, 5031);
      ac_build_endif(&ctx->ac, 5030);
      ac_build_endif(&ctx->ac, 5029);
   }

   /* Build the primitive export. */
   if (!gfx10_ngg_export_prim_early(ctx->shader)) {
      assert(!unterminated_es_if_block);
      gfx10_ngg_build_export_prim(ctx, user_edgeflags, NULL);
   }

   /* Export per-vertex data (positions and parameters). */
   if (!unterminated_es_if_block)
      ac_build_ifcc(&ctx->ac, is_es_thread, 6002);
   {
      unsigned i;

      /* Unconditionally (re-)load the values for proper SSA form. */
      for (i = 0; i < info->num_outputs; i++) {
         /* If the NGG cull shader part computed the position, don't
          * use the position from the current shader part. Instead,
          * load it from LDS.
          */
         if (info->output_semantic_name[i] == TGSI_SEMANTIC_POSITION &&
             ctx->shader->key.opt.ngg_culling) {
            vertex_ptr = ngg_nogs_vertex_ptr(ctx,
                                             ac_get_arg(&ctx->ac, ctx->ngg_old_thread_id));

            for (unsigned j = 0; j < 4; j++) {
               tmp = LLVMConstInt(ctx->i32, lds_pos_x + j, 0);
               tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp);
               tmp = LLVMBuildLoad(builder, tmp, "");
               outputs[i].values[j] = ac_to_float(&ctx->ac, tmp);
            }
         } else {
            for (unsigned j = 0; j < 4; j++) {
               outputs[i].values[j] =
                  LLVMBuildLoad(builder,
                                addrs[4 * i + j], "");
            }
         }
      }

      if (ctx->shader->key.mono.u.vs_export_prim_id) {
         outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
         outputs[i].semantic_index = 0;

         if (ctx->type == PIPE_SHADER_VERTEX) {
            /* Wait for GS stores to finish. */
            ac_build_s_barrier(&ctx->ac);

            tmp = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
            tmp = ac_build_gep0(&ctx->ac, tmp, ctx->i32_0);
            outputs[i].values[0] = LLVMBuildLoad(builder, tmp, "");
         } else {
            assert(ctx->type == PIPE_SHADER_TESS_EVAL);
            outputs[i].values[0] = si_get_primitive_id(ctx, 0);
         }

         outputs[i].values[0] = ac_to_float(&ctx->ac, outputs[i].values[0]);
         for (unsigned j = 1; j < 4; j++)
            outputs[i].values[j] = LLVMGetUndef(ctx->f32);

         memset(outputs[i].vertex_stream, 0,
                sizeof(outputs[i].vertex_stream));
         i++;
      }

      si_llvm_export_vs(ctx, outputs, i);
   }
   ac_build_endif(&ctx->ac, 6002);
}

static LLVMValueRef
ngg_gs_get_vertex_storage(struct si_shader_context *ctx)
{
   const struct si_shader_selector *sel = ctx->shader->selector;
   const struct si_shader_info *info = &sel->info;

   LLVMTypeRef elements[2] = {
      LLVMArrayType(ctx->ac.i32, 4 * info->num_outputs),
      LLVMArrayType(ctx->ac.i8, 4),
   };
   LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false);
   type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS);
   return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, "");
}

/**
 * Return a pointer to the LDS storage reserved for the N'th vertex, where N
 * is in emit order; that is:
 * - during the epilogue, N is the threadidx (relative to the entire threadgroup)
 * - during vertex emit, i.e. while the API GS shader invocation is running,
 *   N = threadidx * gs_max_out_vertices + emitidx
 *
 * Goals of the LDS memory layout:
 * 1. Eliminate bank conflicts on write for geometry shaders that have all emits
 *    in uniform control flow
 * 2. Eliminate bank conflicts on read for export if, additionally, there is no
 *    culling
 * 3. Agnostic to the number of waves (since we don't know it before compiling)
 * 4. Allow coalescing of LDS instructions (ds_write_b128 etc.)
 * 5. Avoid wasting memory.
 *
 * We use an AoS layout due to point 4 (this also helps point 3). In an AoS
 * layout, elimination of bank conflicts requires that each vertex occupy an
 * odd number of dwords. We use the additional dword to store the output stream
 * index as well as a flag to indicate whether this vertex ends a primitive
 * for rasterization.
 *
 * Swizzling is required to satisfy points 1 and 2 simultaneously.
 *
 * Vertices are stored in export order (gsthread * gs_max_out_vertices + emitidx).
 * Indices are swizzled in groups of 32, which ensures point 1 without
 * disturbing point 2.
 *
 * \return an LDS pointer to type {[N x i32], [4 x i8]}
 */
static LLVMValueRef
ngg_gs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vertexidx)
{
   struct si_shader_selector *sel = ctx->shader->selector;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);

   /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
1514 unsigned write_stride_2exp = ffs(sel->gs_max_out_vertices) - 1;
1515 if (write_stride_2exp) {
1516 LLVMValueRef row =
1517 LLVMBuildLShr(builder, vertexidx,
1518 LLVMConstInt(ctx->ac.i32, 5, false), "");
1519 LLVMValueRef swizzle =
1520 LLVMBuildAnd(builder, row,
1521 LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1,
1522 false), "");
1523 vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, "");
1524 }
1525
1526 return ac_build_gep0(&ctx->ac, storage, vertexidx);
1527 }
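
/* Worked example of the swizzle above (for illustration only): with
 * gs_max_out_vertices = 4, write_stride_2exp = ffs(4) - 1 = 2, so the
 * swizzle is taken from bits [6:5] of vertexidx. For thread 9, emitidx 1
 * we get vertexidx = 9 * 4 + 1 = 37, row = 37 >> 5 = 1, swizzle = 1 & 3 = 1,
 * and the final index 37 ^ 1 = 36, staggering vertices from different
 * 32-index groups as described in the layout comment above.
 */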
1528
1529 static LLVMValueRef
1530 ngg_gs_emit_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef gsthread,
1531 LLVMValueRef emitidx)
1532 {
1533 struct si_shader_selector *sel = ctx->shader->selector;
1534 LLVMBuilderRef builder = ctx->ac.builder;
1535 LLVMValueRef tmp;
1536
1537 tmp = LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false);
1538 tmp = LLVMBuildMul(builder, tmp, gsthread, "");
1539 const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, "");
1540 return ngg_gs_vertex_ptr(ctx, vertexidx);
1541 }
1542
1543 static LLVMValueRef
1544 ngg_gs_get_emit_output_ptr(struct si_shader_context *ctx, LLVMValueRef vertexptr,
1545 unsigned out_idx)
1546 {
1547 LLVMValueRef gep_idx[3] = {
1548 ctx->ac.i32_0, /* implied C-style array */
1549 ctx->ac.i32_0, /* first struct entry */
1550 LLVMConstInt(ctx->ac.i32, out_idx, false),
1551 };
1552 return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
1553 }
1554
1555 static LLVMValueRef
1556 ngg_gs_get_emit_primflag_ptr(struct si_shader_context *ctx, LLVMValueRef vertexptr,
1557 unsigned stream)
1558 {
1559 LLVMValueRef gep_idx[3] = {
1560 ctx->ac.i32_0, /* implied C-style array */
1561 ctx->ac.i32_1, /* second struct entry */
1562 LLVMConstInt(ctx->ac.i32, stream, false),
1563 };
1564 return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
1565 }
1566
1567 void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx,
1568 unsigned stream,
1569 LLVMValueRef *addrs)
1570 {
1571 const struct si_shader_selector *sel = ctx->shader->selector;
1572 const struct si_shader_info *info = &sel->info;
1573 LLVMBuilderRef builder = ctx->ac.builder;
1574 LLVMValueRef tmp;
1575 const LLVMValueRef vertexidx =
1576 LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
1577
1578 /* If this thread has already emitted the declared maximum number of
1579 * vertices, skip the write: excessive vertex emissions are not
1580 * supposed to have any effect.
1581 */
1582 const LLVMValueRef can_emit =
1583 LLVMBuildICmp(builder, LLVMIntULT, vertexidx,
1584 LLVMConstInt(ctx->i32, sel->gs_max_out_vertices, false), "");
1585
1586 tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
1587 tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, "");
1588 LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
1589
1590 ac_build_ifcc(&ctx->ac, can_emit, 9001);
1591
1592 const LLVMValueRef vertexptr =
1593 ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx);
1594 unsigned out_idx = 0;
1595 for (unsigned i = 0; i < info->num_outputs; i++) {
1596 for (unsigned chan = 0; chan < 4; chan++, out_idx++) {
1597 if (!(info->output_usagemask[i] & (1 << chan)) ||
1598 ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
1599 continue;
1600
1601 LLVMValueRef out_val = LLVMBuildLoad(builder, addrs[4 * i + chan], "");
1602 out_val = ac_to_integer(&ctx->ac, out_val);
1603 LLVMBuildStore(builder, out_val,
1604 ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx));
1605 }
1606 }
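	/* out_idx has counted dwords; gsvs_vertex_size is in bytes. */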
1607 assert(out_idx * 4 == sel->gsvs_vertex_size);
1608
1609 /* Determine and store whether this vertex completed a primitive. */
1610 const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");
1611
1612 tmp = LLVMConstInt(ctx->ac.i32, u_vertices_per_prim(sel->gs_output_prim) - 1, false);
1613 const LLVMValueRef iscompleteprim =
1614 LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, "");
1615
1616 /* Since the geometry shader emits triangle strips, we need to
1617 * track which primitive is odd and swap vertex indices to get
1618 * the correct vertex order.
1619 */
1620 LLVMValueRef is_odd = ctx->i1false;
1621 if (stream == 0 && u_vertices_per_prim(sel->gs_output_prim) == 3) {
1622 tmp = LLVMBuildAnd(builder, curverts, ctx->i32_1, "");
1623 is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->i32_1, "");
1624 }
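	/* E.g. for a triangle strip (illustration): the first two emits see
	 * curverts = 0 and 1 and complete nothing; the third (curverts = 2)
	 * completes primitive 0 with is_odd = false, the fourth (curverts = 3)
	 * completes primitive 1 with is_odd = true, and so on.
	 */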
1625
1626 tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, "");
1627 LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]);
1628
1629 /* The per-vertex primitive flag encoding:
1630 * bit 0: whether this vertex finishes a primitive
1631 * bit 1: whether the primitive is odd (if we are emitting triangle strips)
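	 *    e.g. a flag value of 3 marks a vertex completing an odd primitive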
1632 */
1633 tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
1634 tmp = LLVMBuildOr(builder, tmp,
1635 LLVMBuildShl(builder,
1636 LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""),
1637 ctx->ac.i8_1, ""), "");
1638 LLVMBuildStore(builder, tmp, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream));
1639
1640 tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
1641 tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
1642 LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
1643
1644 ac_build_endif(&ctx->ac, 9001);
1645 }
1646
1647 void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx)
1648 {
1649 /* Zero out the part of LDS scratch that is used to accumulate the
1650 * per-stream generated primitive count.
1651 */
1652 LLVMBuilderRef builder = ctx->ac.builder;
1653 LLVMValueRef scratchptr = ctx->gs_ngg_scratch;
1654 LLVMValueRef tid = get_thread_id_in_tg(ctx);
1655 LLVMValueRef tmp;
1656
1657 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->i32, 4, false), "");
1658 ac_build_ifcc(&ctx->ac, tmp, 5090);
1659 {
1660 LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid);
1661 LLVMBuildStore(builder, ctx->i32_0, ptr);
1662 }
1663 ac_build_endif(&ctx->ac, 5090);
1664
1665 ac_build_s_barrier(&ctx->ac);
1666 }
1667
1668 void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
1669 {
1670 const struct si_shader_selector *sel = ctx->shader->selector;
1671 const struct si_shader_info *info = &sel->info;
1672 const unsigned verts_per_prim = u_vertices_per_prim(sel->gs_output_prim);
1673 LLVMBuilderRef builder = ctx->ac.builder;
1674 LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false);
1675 LLVMValueRef tmp, tmp2;
1676
1677 /* Zero out remaining (non-emitted) primitive flags.
1678 *
1679 * Note: Alternatively, we could pass the relevant gs_next_vertex to
1680 * the emit threads via LDS. This is likely worse in the expected
1681 * typical case where each GS thread emits the full set of
1682 * vertices.
1683 */
1684 for (unsigned stream = 0; stream < 4; ++stream) {
1685 if (!info->num_stream_output_components[stream])
1686 continue;
1687
1688 const LLVMValueRef gsthread = get_thread_id_in_tg(ctx);
1689
1690 ac_build_bgnloop(&ctx->ac, 5100);
1691
1692 const LLVMValueRef vertexidx =
1693 LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
1694 tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx,
1695 LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), "");
1696 ac_build_ifcc(&ctx->ac, tmp, 5101);
1697 ac_build_break(&ctx->ac);
1698 ac_build_endif(&ctx->ac, 5101);
1699
1700 tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
1701 LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
1702
1703 tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx);
1704 LLVMBuildStore(builder, i8_0, ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream));
1705
1706 ac_build_endloop(&ctx->ac, 5100);
1707 }
1708
1709 /* Accumulate generated primitives counts across the entire threadgroup. */
1710 for (unsigned stream = 0; stream < 4; ++stream) {
1711 if (!info->num_stream_output_components[stream])
1712 continue;
1713
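		/* Sum the per-thread counts across the wave; lane 0 of each wave
		 * then adds the wave's total to the LDS accumulator below. */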
1714 LLVMValueRef numprims =
1715 LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
1716 numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size);
1717
1718 tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->i32_0, "");
1719 ac_build_ifcc(&ctx->ac, tmp, 5105);
1720 {
1721 LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
1722 ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch,
1723 LLVMConstInt(ctx->i32, stream, false)),
1724 numprims, LLVMAtomicOrderingMonotonic, false);
1725 }
1726 ac_build_endif(&ctx->ac, 5105);
1727 }
1728
1729 ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
1730
1731 ac_build_s_barrier(&ctx->ac);
1732
1733 const LLVMValueRef tid = get_thread_id_in_tg(ctx);
1734 LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);
1735
1736 /* Streamout */
1737 if (sel->so.num_outputs) {
1738 struct ngg_streamout nggso = {};
1739
1740 nggso.num_vertices = LLVMConstInt(ctx->i32, verts_per_prim, false);
1741
1742 LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid);
1743 for (unsigned stream = 0; stream < 4; ++stream) {
1744 if (!info->num_stream_output_components[stream])
1745 continue;
1746
1747 tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), "");
1748 tmp = LLVMBuildTrunc(builder, tmp, ctx->i1, "");
1749 tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
1750 nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, "");
1751 }
1752
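		/* The primitive ending at vertex tid consists of the verts_per_prim
		 * most recently emitted vertices, i.e. emit indices
		 * tid - (verts_per_prim - 1) .. tid. */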
1753 for (unsigned i = 0; i < verts_per_prim; ++i) {
1754 tmp = LLVMBuildSub(builder, tid,
1755 LLVMConstInt(ctx->i32, verts_per_prim - i - 1, false), "");
1756 tmp = ngg_gs_vertex_ptr(ctx, tmp);
1757 nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->i32_0);
1758 }
1759
1760 build_streamout(ctx, &nggso);
1761 }
1762
1763 /* Write shader query data. */
1764 if (ctx->screen->use_ngg_streamout) {
1765 tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1);
1766 tmp = LLVMBuildTrunc(builder, tmp, ctx->i1, "");
1767 ac_build_ifcc(&ctx->ac, tmp, 5109); /* if (STREAMOUT_QUERY_ENABLED) */
1768 unsigned num_query_comps = sel->so.num_outputs ? 8 : 4;
1769 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid,
1770 LLVMConstInt(ctx->i32, num_query_comps, false), "");
1771 ac_build_ifcc(&ctx->ac, tmp, 5110);
1772 {
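			/* Each stream has a 32-byte slot in the query buffer. Threads 0-3
			 * add the per-stream generated-prim counts accumulated above;
			 * with streamout, threads 4-7 add the per-stream emitted counts
			 * from the next four scratch dwords, 8 bytes further into the
			 * slot, hence the (tid & 3) * 32 + (tid >> 2) * 8 addressing. */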
1773 LLVMValueRef offset;
1774 tmp = tid;
1775 if (sel->so.num_outputs)
1776 tmp = LLVMBuildAnd(builder, tmp, LLVMConstInt(ctx->i32, 3, false), "");
1777 offset = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->i32, 32, false), "");
1778 if (sel->so.num_outputs) {
1779 tmp = LLVMBuildLShr(builder, tid, LLVMConstInt(ctx->i32, 2, false), "");
1780 tmp = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->i32, 8, false), "");
1781 offset = LLVMBuildAdd(builder, offset, tmp, "");
1782 }
1783
1784 tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
1785 LLVMValueRef args[] = {
1786 tmp,
1787 ngg_get_query_buf(ctx),
1788 offset,
1789 LLVMConstInt(ctx->i32, 16, false), /* soffset */
1790 ctx->i32_0, /* cachepolicy */
1791 };
1792 ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32",
1793 ctx->i32, args, 5, 0);
1794 }
1795 ac_build_endif(&ctx->ac, 5110);
1796 ac_build_endif(&ctx->ac, 5109);
1797 }
1798
1799 /* Determine vertex liveness. */
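	/* A vertex (in emit order) is live iff at least one of the up to
	 * verts_per_prim primitives referencing it (the primitives ending at
	 * vertices tid .. tid + verts_per_prim - 1) is non-null. */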
1800 LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive");
1801
1802 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
1803 ac_build_ifcc(&ctx->ac, tmp, 5120);
1804 {
1805 for (unsigned i = 0; i < verts_per_prim; ++i) {
1806 const LLVMValueRef primidx =
1807 LLVMBuildAdd(builder, tid,
1808 LLVMConstInt(ctx->ac.i32, i, false), "");
1809
1810 if (i > 0) {
1811 tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, "");
1812 ac_build_ifcc(&ctx->ac, tmp, 5121 + i);
1813 }
1814
1815 /* Load primitive liveness */
1816 tmp = ngg_gs_vertex_ptr(ctx, primidx);
1817 tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
1818 const LLVMValueRef primlive =
1819 LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1820
1821 tmp = LLVMBuildLoad(builder, vertliveptr, "");
1822 tmp = LLVMBuildOr(builder, tmp, primlive, "");
1823 LLVMBuildStore(builder, tmp, vertliveptr);
1824
1825 if (i > 0)
1826 ac_build_endif(&ctx->ac, 5121 + i);
1827 }
1828 }
1829 ac_build_endif(&ctx->ac, 5120);
1830
1831 /* Exclusive scan addition across the entire threadgroup; the reduction also yields the total number of live vertices. */
1832 LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, "");
1833 struct ac_wg_scan vertlive_scan = {};
1834 vertlive_scan.op = nir_op_iadd;
1835 vertlive_scan.enable_reduce = true;
1836 vertlive_scan.enable_exclusive = true;
1837 vertlive_scan.src = vertlive;
1838 vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->i32_0);
1839 vertlive_scan.waveidx = get_wave_id_in_tg(ctx);
1840 vertlive_scan.numwaves = get_tgsize(ctx);
1841 vertlive_scan.maxwaves = 8;
1842
1843 ac_build_wg_scan(&ctx->ac, &vertlive_scan);
1844
1845 /* Skip all exports (including index exports) when possible. At least on
1846 * early gfx10 revisions this also serves to avoid hangs.
1847 */
1848 LLVMValueRef have_exports =
1849 LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, "");
1850 num_emit_threads =
1851 LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, "");
1852
1853 /* Allocate export space. Send this message as early as possible, to
1854 * hide the latency of the SQ <-> SPI roundtrip.
1855 *
1856 * Note: We could consider compacting primitives for export as well.
1857 * PA processes 1 non-null prim / clock, but it fetches 4 DW of
1858 * prim data per clock and skips null primitives at no additional
1859 * cost. So compacting primitives can only be beneficial when
1860 * there are 4 or more contiguous null primitives in the export
1861 * (in the common case of single-dword prim exports).
1862 */
1863 ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx),
1864 vertlive_scan.result_reduce, num_emit_threads);
1865
1866 /* Set up the reverse vertex compaction permutation. We re-use stream 1
1867 * of the primitive liveness flags, relying on the fact that each
1868 * threadgroup can have at most 256 threads. */
1869 ac_build_ifcc(&ctx->ac, vertlive, 5130);
1870 {
1871 tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive);
1872 tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, "");
1873 LLVMBuildStore(builder, tmp2, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1));
1874 }
1875 ac_build_endif(&ctx->ac, 5130);
1876
1877 ac_build_s_barrier(&ctx->ac);
1878
1879 /* Export primitive data */
1880 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
1881 ac_build_ifcc(&ctx->ac, tmp, 5140);
1882 {
1883 LLVMValueRef flags;
1884 struct ac_ngg_prim prim = {};
1885 prim.num_vertices = verts_per_prim;
1886
1887 tmp = ngg_gs_vertex_ptr(ctx, tid);
1888 flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
1889 prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->i1, ""), "");
1890
1891 for (unsigned i = 0; i < verts_per_prim; ++i) {
1892 prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive,
1893 LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
1894 prim.edgeflag[i] = ctx->ac.i1false;
1895 }
1896
1897 /* Geometry shaders output triangle strips, but NGG expects triangles. */
1898 if (verts_per_prim == 3) {
1899 LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, "");
1900 is_odd = LLVMBuildTrunc(builder, is_odd, ctx->i1, "");
1901 LLVMValueRef flatshade_first =
1902 LLVMBuildICmp(builder, LLVMIntEQ,
1903 si_unpack_param(ctx, ctx->vs_state_bits, 4, 2),
1904 ctx->i32_0, "");
1905
1906 ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd,
1907 flatshade_first,
1908 prim.index);
1909 }
1910
1911 ac_build_export_prim(&ctx->ac, &prim);
1912 }
1913 ac_build_endif(&ctx->ac, 5140);
1914
1915 /* Export position and parameter data */
1916 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, "");
1917 ac_build_ifcc(&ctx->ac, tmp, 5145);
1918 {
1919 struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];
1920
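		/* Read back the reverse compaction permutation stored above:
		 * primflag stream 1 of slot tid holds the emit-order index of
		 * the live vertex that was compacted into slot tid. */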
1921 tmp = ngg_gs_vertex_ptr(ctx, tid);
1922 tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), "");
1923 tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
1924 const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp);
1925
1926 unsigned out_idx = 0;
1927 for (unsigned i = 0; i < info->num_outputs; i++) {
1928 outputs[i].semantic_name = info->output_semantic_name[i];
1929 outputs[i].semantic_index = info->output_semantic_index[i];
1930
1931 for (unsigned j = 0; j < 4; j++, out_idx++) {
1932 tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx);
1933 tmp = LLVMBuildLoad(builder, tmp, "");
1934 outputs[i].values[j] = ac_to_float(&ctx->ac, tmp);
1935 outputs[i].vertex_stream[j] =
1936 (info->output_streams[i] >> (2 * j)) & 3;
1937 }
1938 }
1939
1940 si_llvm_export_vs(ctx, outputs, info->num_outputs);
1941 }
1942 ac_build_endif(&ctx->ac, 5145);
1943 }
1944
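/* Limit the number of GS prims so that, with maximal vertex reuse, they
 * still fit into max_esverts ES vertices: the first primitive consumes
 * min_verts_per_prim vertices and each further one at least one more,
 * with only every other vertex reusable for adjacency primitives (a
 * rough reading of the formula below).
 */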
1945 static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts,
1946 unsigned min_verts_per_prim, bool use_adjacency)
1947 {
1948 unsigned max_reuse = max_esverts - min_verts_per_prim;
1949 if (use_adjacency)
1950 max_reuse /= 2;
1951 *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
1952 }
1953
1954 /**
1955 * Determine subgroup information like maximum number of vertices and prims.
1956 *
1957 * This happens before the shader is uploaded, since LDS relocations during
1958 * upload depend on the subgroup size.
1959 */
1960 void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
1961 {
1962 const struct si_shader_selector *gs_sel = shader->selector;
1963 const struct si_shader_selector *es_sel =
1964 shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel;
1965 const enum pipe_shader_type gs_type = gs_sel->type;
1966 const unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1);
1967 const unsigned input_prim = si_get_input_prim(gs_sel);
1968 const bool use_adjacency = input_prim >= PIPE_PRIM_LINES_ADJACENCY &&
1969 input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;
1970 const unsigned max_verts_per_prim = u_vertices_per_prim(input_prim);
1971 const unsigned min_verts_per_prim =
1972 gs_type == PIPE_SHADER_GEOMETRY ? max_verts_per_prim : 1;
1973
1974 /* All these are in dwords: */
1975 /* We can't allow using the whole LDS, because GS waves compete with
1976 * other shader stages for LDS space.
1977 *
1978 * TODO: We should really take the shader's internal LDS use into
1979 * account. The linker will fail if the size is greater than
1980 * 8K dwords.
1981 */
1982 const unsigned max_lds_size = 8 * 1024 - 768;
1983 const unsigned target_lds_size = max_lds_size;
1984 unsigned esvert_lds_size = 0;
1985 unsigned gsprim_lds_size = 0;
1986
1987 /* All these are per subgroup: */
1988 bool max_vert_out_per_gs_instance = false;
1989 unsigned max_esverts_base = 128;
1990 unsigned max_gsprims_base = 128; /* default prim group size clamp */
1991
1992 /* Hardware has the following non-natural restrictions on the value
1993 * of GE_CNTL.VERT_GRP_SIZE based on the primitive type of
1994 * the draw:
1995 * - at most 252 for any line input primitive type
1996 * - at most 251 for any quad input primitive type
1997 * - at most 251 for triangle strips with adjacency (this happens to
1998 * be the natural limit for triangle *lists* with adjacency)
1999 */
2000 max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
2001
2002 if (gs_type == PIPE_SHADER_GEOMETRY) {
2003 unsigned max_out_verts_per_gsprim =
2004 gs_sel->gs_max_out_vertices * gs_num_invocations;
2005
2006 if (max_out_verts_per_gsprim <= 256) {
2007 if (max_out_verts_per_gsprim) {
2008 max_gsprims_base = MIN2(max_gsprims_base,
2009 256 / max_out_verts_per_gsprim);
2010 }
2011 } else {
2012 /* Use special multi-cycling mode in which each GS
2013 * instance gets its own subgroup. Does not work with
2014 * tessellation. */
2015 max_vert_out_per_gs_instance = true;
2016 max_gsprims_base = 1;
2017 max_out_verts_per_gsprim = gs_sel->gs_max_out_vertices;
2018 }
2019
2020 esvert_lds_size = es_sel->esgs_itemsize / 4;
2021 gsprim_lds_size = (gs_sel->gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
2022 } else {
2023 /* VS and TES. */
2024 /* LDS size for passing data from ES to GS. */
2025 esvert_lds_size = ngg_nogs_vertex_size(shader);
2026 }
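
	/* Worked example (illustration only): a GS writing 8 dwords per vertex
	 * (gsvs_vertex_size = 32 bytes) with gs_max_out_vertices = 4 and one
	 * invocation gets gsprim_lds_size = (8 + 1) * 4 = 36 dwords per GS prim;
	 * the +1 dword per vertex is the {[N x i32], [4 x i8]} primflag slot
	 * described above ngg_gs_vertex_ptr. */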
2027
2028 unsigned max_gsprims = max_gsprims_base;
2029 unsigned max_esverts = max_esverts_base;
2030
2031 if (esvert_lds_size)
2032 max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
2033 if (gsprim_lds_size)
2034 max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
2035
2036 max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2037 clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
2038 assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2039
2040 if (esvert_lds_size || gsprim_lds_size) {
2041 /* Now that we have a rough proportionality between esverts
2042 * and gsprims based on the primitive type, scale both of them
2043 * down simultaneously based on required LDS space.
2044 *
2045 * We could be smarter about this if we knew how much vertex
2046 * reuse to expect.
2047 */
2048 unsigned lds_total = max_esverts * esvert_lds_size +
2049 max_gsprims * gsprim_lds_size;
2050 if (lds_total > target_lds_size) {
2051 max_esverts = max_esverts * target_lds_size / lds_total;
2052 max_gsprims = max_gsprims * target_lds_size / lds_total;
2053
2054 max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2055 clamp_gsprims_to_esverts(&max_gsprims, max_esverts,
2056 min_verts_per_prim, use_adjacency);
2057 assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2058 }
2059 }
2060
2061 /* Round up towards full wave sizes for better ALU utilization. */
2062 if (!max_vert_out_per_gs_instance) {
2063 const unsigned wavesize = gs_sel->screen->ge_wave_size;
2064 unsigned orig_max_esverts;
2065 unsigned orig_max_gsprims;
2066 do {
2067 orig_max_esverts = max_esverts;
2068 orig_max_gsprims = max_gsprims;
2069
2070 max_esverts = align(max_esverts, wavesize);
2071 max_esverts = MIN2(max_esverts, max_esverts_base);
2072 if (esvert_lds_size)
2073 max_esverts = MIN2(max_esverts,
2074 (max_lds_size - max_gsprims * gsprim_lds_size) /
2075 esvert_lds_size);
2076 max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2077
2078 max_gsprims = align(max_gsprims, wavesize);
2079 max_gsprims = MIN2(max_gsprims, max_gsprims_base);
2080 if (gsprim_lds_size)
2081 max_gsprims = MIN2(max_gsprims,
2082 (max_lds_size - max_esverts * esvert_lds_size) /
2083 gsprim_lds_size);
2084 clamp_gsprims_to_esverts(&max_gsprims, max_esverts,
2085 min_verts_per_prim, use_adjacency);
2086 assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2087 } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
2088 }
2089
2090 /* Hardware restriction: minimum value of max_esverts */
2091 max_esverts = MAX2(max_esverts, 23 + max_verts_per_prim);
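	/* Together with hw_max_esverts = max_esverts - max_verts_per_prim + 1
	 * below, this guarantees the hw_max_esverts >= 24 limit asserted at
	 * the end of this function. */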
2092
2093 unsigned max_out_vertices =
2094 max_vert_out_per_gs_instance ? gs_sel->gs_max_out_vertices :
2095 gs_type == PIPE_SHADER_GEOMETRY ?
2096 max_gsprims * gs_num_invocations * gs_sel->gs_max_out_vertices :
2097 max_esverts;
2098 assert(max_out_vertices <= 256);
2099
2100 unsigned prim_amp_factor = 1;
2101 if (gs_type == PIPE_SHADER_GEOMETRY) {
2102 /* Number of output primitives per GS input primitive after
2103 * GS instancing; gs_max_out_vertices is a conservative upper bound. */
2104 prim_amp_factor = gs_sel->gs_max_out_vertices;
2105 }
2106
2107 /* The GE only checks against the maximum number of ES verts after
2108 * allocating a full GS primitive. So we need to ensure that whenever
2109 * this check passes, there is enough space for a full primitive without
2110 * vertex reuse.
2111 */
2112 shader->ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1;
2113 shader->ngg.max_gsprims = max_gsprims;
2114 shader->ngg.max_out_verts = max_out_vertices;
2115 shader->ngg.prim_amp_factor = prim_amp_factor;
2116 shader->ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
2117
2118 shader->gs_info.esgs_ring_size = 4 * max_esverts * esvert_lds_size;
2119 shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size;
2120
2121 assert(shader->ngg.hw_max_esverts >= 24); /* HW limitation */
2122 }