radeonsi/gfx10: generate geometry shaders for NGG
[mesa.git] / src / gallium / drivers / radeonsi / gfx10_shader_ngg.c
1 /*
2 * Copyright 2017 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23
24 #include "si_pipe.h"
25 #include "si_shader_internal.h"
26
27 #include "sid.h"
28
29 #include "util/u_memory.h"
30 #include "util/u_prim.h"
31
32 static LLVMValueRef get_wave_id_in_tg(struct si_shader_context *ctx)
33 {
34 return si_unpack_param(ctx, ctx->param_merged_wave_info, 24, 4);
35 }
36
37 static LLVMValueRef get_tgsize(struct si_shader_context *ctx)
38 {
39 return si_unpack_param(ctx, ctx->param_merged_wave_info, 28, 4);
40 }
41
42 static LLVMValueRef get_thread_id_in_tg(struct si_shader_context *ctx)
43 {
44 LLVMBuilderRef builder = ctx->ac.builder;
45 LLVMValueRef tmp;
46 tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
47 LLVMConstInt(ctx->ac.i32, 64, false), "");
48 return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
49 }
50
51 static LLVMValueRef ngg_get_vtx_cnt(struct si_shader_context *ctx)
52 {
53 return ac_build_bfe(&ctx->ac, ctx->gs_tg_info,
54 LLVMConstInt(ctx->ac.i32, 12, false),
55 LLVMConstInt(ctx->ac.i32, 9, false),
56 false);
57 }
58
59 static LLVMValueRef ngg_get_prim_cnt(struct si_shader_context *ctx)
60 {
61 return ac_build_bfe(&ctx->ac, ctx->gs_tg_info,
62 LLVMConstInt(ctx->ac.i32, 22, false),
63 LLVMConstInt(ctx->ac.i32, 9, false),
64 false);
65 }
66
67 /* Send GS Alloc Req message from the first wave of the group to SPI.
68 * Message payload is:
69 * - bits 0..10: vertices in group
70 * - bits 12..22: primitives in group
71 */
72 static void build_sendmsg_gs_alloc_req(struct si_shader_context *ctx,
73 LLVMValueRef vtx_cnt,
74 LLVMValueRef prim_cnt)
75 {
76 LLVMBuilderRef builder = ctx->ac.builder;
77 LLVMValueRef tmp;
78
79 tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
80 ac_build_ifcc(&ctx->ac, tmp, 5020);
81
82 tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->ac.i32, 12, false),"");
83 tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");
84 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_ALLOC_REQ, tmp);
85
86 ac_build_endif(&ctx->ac, 5020);
87 }
88
/* Data for one primitive export; see build_export_prim for how these
 * fields are packed into the export dword. */
struct ngg_prim {
	unsigned num_vertices;     /* vertices per primitive; only index[0..num_vertices-1] are used */
	LLVMValueRef isnull;       /* i1: true = null primitive (skipped by PA) */
	LLVMValueRef index[3];     /* i32 vertex indices */
	LLVMValueRef edgeflag[3];  /* i1 edge flags, one per vertex */
};
95
/* Emit a primitive export, packing the vertex indices, edge flags and the
 * null-primitive bit of \p prim into a single dword:
 * bits 10*i..10*i+8 = index i, bit 10*i+9 = edge flag i, bit 31 = isnull.
 */
static void build_export_prim(struct si_shader_context *ctx,
			      const struct ngg_prim *prim)
{
	LLVMBuilderRef builder = ctx->ac.builder;
	struct ac_export_args args;
	LLVMValueRef tmp;

	/* Bit 31: null primitive (skip). */
	tmp = LLVMBuildZExt(builder, prim->isnull, ctx->ac.i32, "");
	args.out[0] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 31, false), "");

	for (unsigned i = 0; i < prim->num_vertices; ++i) {
		/* Bits 10*i .. 10*i+8: vertex index i. */
		tmp = LLVMBuildShl(builder, prim->index[i],
				   LLVMConstInt(ctx->ac.i32, 10 * i, false), "");
		args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, "");
		/* Bit 10*i+9: edge flag i. */
		tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->ac.i32, "");
		tmp = LLVMBuildShl(builder, tmp,
				   LLVMConstInt(ctx->ac.i32, 10 * i + 9, false), "");
		args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, "");
	}

	/* Single-dword export: only channel 0 is meaningful. */
	args.out[0] = LLVMBuildBitCast(builder, args.out[0], ctx->ac.f32, "");
	args.out[1] = LLVMGetUndef(ctx->ac.f32);
	args.out[2] = LLVMGetUndef(ctx->ac.f32);
	args.out[3] = LLVMGetUndef(ctx->ac.f32);

	args.target = V_008DFC_SQ_EXP_PRIM;
	args.enabled_channels = 1;
	args.done = true;
	args.valid_mask = false;
	args.compr = false;

	ac_build_export(&ctx->ac, &args);
}
129
/**
 * Emit the epilogue of an API VS or TES shader compiled as ESGS shader.
 *
 * Sends the GS_ALLOC_REQ message, exports primitive connectivity data from
 * GS threads, and exports positions/parameters from ES threads.
 *
 * \param abi          shader ABI; must wrap an si_shader_context
 * \param max_outputs  upper bound on the number of output slots in \p addrs
 * \param addrs        output value pointers, 4 channels per output slot
 */
void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
			     unsigned max_outputs,
			     LLVMValueRef *addrs)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	struct si_shader_output_values *outputs = NULL;
	LLVMBuilderRef builder = ctx->ac.builder;
	struct lp_build_if_state if_state;
	LLVMValueRef tmp;

	assert(!ctx->shader->is_gs_copy_shader);
	assert(info->num_outputs <= max_outputs);

	/* +1 slot reserved for the optional PRIMID output appended below. */
	outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));

	for (unsigned i = 0; i < info->num_outputs; i++) {
		outputs[i].semantic_name = info->output_semantic_name[i];
		outputs[i].semantic_index = info->output_semantic_index[i];

		/* This is used only by streamout. */
		for (unsigned j = 0; j < 4; j++) {
			outputs[i].values[j] =
				LLVMBuildLoad(builder,
					      addrs[4 * i + j],
					      "");
			outputs[i].vertex_stream[j] =
				(info->output_streams[i] >> (2 * j)) & 3;
		}
	}

	/* Leave the conditional wrapping the merged ES part; the code below
	 * runs for all threads of the merged shader. */
	lp_build_endif(&ctx->merged_wrap_if_state);

	/* Per-wave primitive/vertex counts come from the merged wave info SGPR. */
	LLVMValueRef prims_in_wave = si_unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
	LLVMValueRef vtx_in_wave = si_unpack_param(ctx, ctx->param_merged_wave_info, 0, 8);
	LLVMValueRef is_gs_thread = LLVMBuildICmp(builder, LLVMIntULT,
						  ac_get_thread_id(&ctx->ac), prims_in_wave, "");
	LLVMValueRef is_es_thread = LLVMBuildICmp(builder, LLVMIntULT,
						  ac_get_thread_id(&ctx->ac), vtx_in_wave, "");
	/* Vertex indices of this thread's primitive, packed in GS VGPRs. */
	LLVMValueRef vtxindex[] = {
		si_unpack_param(ctx, ctx->param_gs_vtx01_offset, 0, 16),
		si_unpack_param(ctx, ctx->param_gs_vtx01_offset, 16, 16),
		si_unpack_param(ctx, ctx->param_gs_vtx23_offset, 0, 16),
	};

	/* Determine the number of vertices per primitive. */
	unsigned num_vertices;
	/* NOTE(review): num_vertices_val is computed but not consumed in this
	 * function yet — presumably reserved for the culling TODO below. */
	LLVMValueRef num_vertices_val;

	if (ctx->type == PIPE_SHADER_VERTEX) {
		if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS]) {
			/* Blits always use axis-aligned rectangles with 3 vertices. */
			num_vertices = 3;
			num_vertices_val = LLVMConstInt(ctx->i32, 3, 0);
		} else {
			/* Extract OUTPRIM field. */
			tmp = si_unpack_param(ctx, ctx->param_vs_state_bits, 2, 2);
			num_vertices_val = LLVMBuildAdd(builder, tmp, ctx->i32_1, "");
			num_vertices = 3; /* TODO: optimize for points & lines */
		}
	} else {
		assert(ctx->type == PIPE_SHADER_TESS_EVAL);

		if (info->properties[TGSI_PROPERTY_TES_POINT_MODE])
			num_vertices = 1;
		else if (info->properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES)
			num_vertices = 2;
		else
			num_vertices = 3;

		num_vertices_val = LLVMConstInt(ctx->i32, num_vertices, false);
	}

	/* TODO: streamout */

	/* TODO: primitive culling */

	/* Request export space for the whole threadgroup (wave 0 only sends). */
	build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx));

	/* Export primitive data to the index buffer. Format is:
	 * - bits 0..8: index 0
	 * - bit 9: edge flag 0
	 * - bits 10..18: index 1
	 * - bit 19: edge flag 1
	 * - bits 20..28: index 2
	 * - bit 29: edge flag 2
	 * - bit 31: null primitive (skip)
	 *
	 * For the first version, we will always build up all three indices
	 * independent of the primitive type. The additional garbage data
	 * shouldn't hurt.
	 *
	 * TODO: culling depends on the primitive type, so can have some
	 * interaction here.
	 */
	lp_build_if(&if_state, &ctx->gallivm, is_gs_thread);
	{
		struct ngg_prim prim = {};

		prim.num_vertices = num_vertices;
		prim.isnull = ctx->ac.i1false;
		memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3);

		/* Edge flags arrive in bits 8..10 of gs_invocation_id. */
		for (unsigned i = 0; i < num_vertices; ++i) {
			tmp = LLVMBuildLShr(builder, ctx->abi.gs_invocation_id,
					    LLVMConstInt(ctx->ac.i32, 8 + i, false), "");
			prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
		}

		build_export_prim(ctx, &prim);
	}
	lp_build_endif(&if_state);

	/* Export per-vertex data (positions and parameters). */
	lp_build_if(&if_state, &ctx->gallivm, is_es_thread);
	{
		unsigned i;

		/* Unconditionally (re-)load the values for proper SSA form. */
		for (i = 0; i < info->num_outputs; i++) {
			for (unsigned j = 0; j < 4; j++) {
				outputs[i].values[j] =
					LLVMBuildLoad(builder,
						      addrs[4 * i + j],
						      "");
			}
		}

		/* TODO: Vertex shaders have to get PrimitiveID from GS VGPRs. */
		if (ctx->type == PIPE_SHADER_TESS_EVAL &&
		    ctx->shader->key.mono.u.vs_export_prim_id) {
			/* Append PRIMID as an extra output (uses the +1 slot). */
			outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
			outputs[i].semantic_index = 0;
			outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
			for (unsigned j = 1; j < 4; j++)
				outputs[i].values[j] = LLVMGetUndef(ctx->f32);

			memset(outputs[i].vertex_stream, 0,
			       sizeof(outputs[i].vertex_stream));
			i++;
		}

		si_llvm_export_vs(ctx, outputs, i);
	}
	lp_build_endif(&if_state);

	FREE(outputs);
}
281
282 static LLVMValueRef
283 ngg_gs_get_vertex_storage(struct si_shader_context *ctx)
284 {
285 const struct si_shader_selector *sel = ctx->shader->selector;
286 const struct tgsi_shader_info *info = &sel->info;
287
288 LLVMTypeRef elements[2] = {
289 LLVMArrayType(ctx->ac.i32, 4 * info->num_outputs),
290 LLVMArrayType(ctx->ac.i8, 4),
291 };
292 LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false);
293 type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS);
294 return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, "");
295 }
296
297 /**
298 * Return a pointer to the LDS storage reserved for the N'th vertex, where N
299 * is in emit order; that is:
300 * - during the epilogue, N is the threadidx (relative to the entire threadgroup)
301 * - during vertex emit, i.e. while the API GS shader invocation is running,
302 * N = threadidx * gs_max_out_vertices + emitidx
303 *
304 * Goals of the LDS memory layout:
305 * 1. Eliminate bank conflicts on write for geometry shaders that have all emits
306 * in uniform control flow
307 * 2. Eliminate bank conflicts on read for export if, additionally, there is no
308 * culling
309 * 3. Agnostic to the number of waves (since we don't know it before compiling)
310 * 4. Allow coalescing of LDS instructions (ds_write_b128 etc.)
311 * 5. Avoid wasting memory.
312 *
313 * We use an AoS layout due to point 4 (this also helps point 3). In an AoS
314 * layout, elimination of bank conflicts requires that each vertex occupy an
315 * odd number of dwords. We use the additional dword to store the output stream
316 * index as well as a flag to indicate whether this vertex ends a primitive
317 * for rasterization.
318 *
319 * Swizzling is required to satisfy points 1 and 2 simultaneously.
320 *
321 * Vertices are stored in export order (gsthread * gs_max_out_vertices + emitidx).
322 * Indices are swizzled in groups of 32, which ensures point 1 without
323 * disturbing point 2.
324 *
325 * \return an LDS pointer to type {[N x i32], [4 x i8]}
326 */
327 static LLVMValueRef
328 ngg_gs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vertexidx)
329 {
330 struct si_shader_selector *sel = ctx->shader->selector;
331 LLVMBuilderRef builder = ctx->ac.builder;
332 LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);
333
334 /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
335 unsigned write_stride_2exp = ffs(sel->gs_max_out_vertices) - 1;
336 if (write_stride_2exp) {
337 LLVMValueRef row =
338 LLVMBuildLShr(builder, vertexidx,
339 LLVMConstInt(ctx->ac.i32, 5, false), "");
340 LLVMValueRef swizzle =
341 LLVMBuildAnd(builder, row,
342 LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1,
343 false), "");
344 vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, "");
345 }
346
347 return ac_build_gep0(&ctx->ac, storage, vertexidx);
348 }
349
350 static LLVMValueRef
351 ngg_gs_emit_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef gsthread,
352 LLVMValueRef emitidx)
353 {
354 struct si_shader_selector *sel = ctx->shader->selector;
355 LLVMBuilderRef builder = ctx->ac.builder;
356 LLVMValueRef tmp;
357
358 tmp = LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false);
359 tmp = LLVMBuildMul(builder, tmp, gsthread, "");
360 const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, "");
361 return ngg_gs_vertex_ptr(ctx, vertexidx);
362 }
363
/* NGG implementation of the GS EMIT instruction: store one vertex for
 * \p stream into this thread's LDS vertex storage and record whether it
 * completes a primitive.
 *
 * \param stream  vertex stream index (0..3)
 * \param addrs   output value pointers, 4 channels per output slot
 */
void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx,
			      unsigned stream,
			      LLVMValueRef *addrs)
{
	const struct si_shader_selector *sel = ctx->shader->selector;
	const struct tgsi_shader_info *info = &sel->info;
	LLVMBuilderRef builder = ctx->ac.builder;
	struct lp_build_if_state if_state;
	LLVMValueRef tmp;
	const LLVMValueRef vertexidx =
		LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");

	/* If this thread has already emitted the declared maximum number of
	 * vertices, skip the write: excessive vertex emissions are not
	 * supposed to have any effect.
	 */
	const LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, vertexidx,
			      LLVMConstInt(ctx->i32, sel->gs_max_out_vertices, false), "");

	/* Advance the per-stream emit counter only when the emit is allowed. */
	tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
	tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, "");
	LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);

	lp_build_if(&if_state, &ctx->gallivm, can_emit);

	/* Store the enabled output channels of this stream into the vertex's
	 * dword array (first struct member of the LDS record). */
	const LLVMValueRef vertexptr =
		ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx);
	unsigned out_idx = 0;
	for (unsigned i = 0; i < info->num_outputs; i++) {
		/* out_idx advances for every channel so the LDS layout stays
		 * fixed regardless of which channels are written. */
		for (unsigned chan = 0; chan < 4; chan++, out_idx++) {
			if (!(info->output_usagemask[i] & (1 << chan)) ||
			    ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
				continue;

			LLVMValueRef out_val = LLVMBuildLoad(builder, addrs[4 * i + chan], "");
			LLVMValueRef gep_idx[3] = {
				ctx->ac.i32_0, /* implied C-style array */
				ctx->ac.i32_0, /* first entry of struct */
				LLVMConstInt(ctx->ac.i32, out_idx, false),
			};
			LLVMValueRef ptr = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, "");

			out_val = ac_to_integer(&ctx->ac, out_val);
			LLVMBuildStore(builder, out_val, ptr);
		}
	}
	assert(out_idx * 4 == sel->gsvs_vertex_size);

	/* Determine and store whether this vertex completed a primitive. */
	const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");

	tmp = LLVMConstInt(ctx->ac.i32, u_vertices_per_prim(sel->gs_output_prim) - 1, false);
	const LLVMValueRef iscompleteprim =
		LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, "");

	tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, "");
	LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]);

	/* Primitive flag byte for this stream (second struct member). */
	LLVMValueRef gep_idx[3] = {
		ctx->ac.i32_0, /* implied C-style array */
		ctx->ac.i32_1, /* second struct entry */
		LLVMConstInt(ctx->ac.i32, stream, false),
	};
	const LLVMValueRef primflagptr =
		LLVMBuildGEP(builder, vertexptr, gep_idx, 3, "");

	tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
	LLVMBuildStore(builder, tmp, primflagptr);

	lp_build_endif(&if_state);
}
436
437 void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
438 {
439 const struct si_shader_selector *sel = ctx->shader->selector;
440 const struct tgsi_shader_info *info = &sel->info;
441 const unsigned verts_per_prim = u_vertices_per_prim(sel->gs_output_prim);
442 LLVMBuilderRef builder = ctx->ac.builder;
443 LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false);
444 LLVMValueRef tmp, tmp2;
445
446 /* Zero out remaining (non-emitted) primitive flags.
447 *
448 * Note: Alternatively, we could pass the relevant gs_next_vertex to
449 * the emit threads via LDS. This is likely worse in the expected
450 * typical case where each GS thread emits the full set of
451 * vertices.
452 */
453 for (unsigned stream = 0; stream < 4; ++stream) {
454 if (!info->num_stream_output_components[stream])
455 continue;
456
457 const LLVMValueRef gsthread = get_thread_id_in_tg(ctx);
458
459 ac_build_bgnloop(&ctx->ac, 5100);
460
461 const LLVMValueRef vertexidx =
462 LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
463 tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx,
464 LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), "");
465 ac_build_ifcc(&ctx->ac, tmp, 5101);
466 ac_build_break(&ctx->ac);
467 ac_build_endif(&ctx->ac, 5101);
468
469 tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
470 LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
471
472 tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx);
473 LLVMValueRef gep_idx[3] = {
474 ctx->ac.i32_0, /* implied C-style array */
475 ctx->ac.i32_1, /* second entry of struct */
476 LLVMConstInt(ctx->ac.i32, stream, false),
477 };
478 tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, "");
479 LLVMBuildStore(builder, i8_0, tmp);
480
481 ac_build_endloop(&ctx->ac, 5100);
482 }
483
484 lp_build_endif(&ctx->merged_wrap_if_state);
485
486 ac_build_s_barrier(&ctx->ac);
487
488 const LLVMValueRef tid = get_thread_id_in_tg(ctx);
489 LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);
490
491 /* TODO: streamout */
492
493 /* TODO: culling */
494
495 /* Determine vertex liveness. */
496 LLVMValueRef vertliveptr = lp_build_alloca(&ctx->gallivm, ctx->ac.i1, "vertexlive");
497
498 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
499 ac_build_ifcc(&ctx->ac, tmp, 5120);
500 {
501 for (unsigned i = 0; i < verts_per_prim; ++i) {
502 const LLVMValueRef primidx =
503 LLVMBuildAdd(builder, tid,
504 LLVMConstInt(ctx->ac.i32, i, false), "");
505
506 if (i > 0) {
507 tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, "");
508 ac_build_ifcc(&ctx->ac, tmp, 5121 + i);
509 }
510
511 /* Load primitive liveness */
512 tmp = ngg_gs_vertex_ptr(ctx, primidx);
513 LLVMValueRef gep_idx[3] = {
514 ctx->ac.i32_0, /* implicit C-style array */
515 ctx->ac.i32_1, /* second value of struct */
516 ctx->ac.i32_0, /* stream 0 */
517 };
518 tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, "");
519 tmp = LLVMBuildLoad(builder, tmp, "");
520 const LLVMValueRef primlive =
521 LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
522
523 tmp = LLVMBuildLoad(builder, vertliveptr, "");
524 tmp = LLVMBuildOr(builder, tmp, primlive, ""),
525 LLVMBuildStore(builder, tmp, vertliveptr);
526
527 if (i > 0)
528 ac_build_endif(&ctx->ac, 5121 + i);
529 }
530 }
531 ac_build_endif(&ctx->ac, 5120);
532
533 /* Inclusive scan addition across the current wave. */
534 LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, "");
535 struct ac_wg_scan vertlive_scan = {};
536 vertlive_scan.op = nir_op_iadd;
537 vertlive_scan.enable_reduce = true;
538 vertlive_scan.enable_exclusive = true;
539 vertlive_scan.src = vertlive;
540 vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->i32_0);
541 vertlive_scan.waveidx = get_wave_id_in_tg(ctx);
542 vertlive_scan.numwaves = get_tgsize(ctx);
543 vertlive_scan.maxwaves = 8;
544
545 ac_build_wg_scan(&ctx->ac, &vertlive_scan);
546
547 /* Skip all exports (including index exports) when possible. At least on
548 * early gfx10 revisions this is also to avoid hangs.
549 */
550 LLVMValueRef have_exports =
551 LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, "");
552 num_emit_threads =
553 LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, "");
554
555 /* Allocate export space. Send this message as early as possible, to
556 * hide the latency of the SQ <-> SPI roundtrip.
557 *
558 * Note: We could consider compacting primitives for export as well.
559 * PA processes 1 non-null prim / clock, but it fetches 4 DW of
560 * prim data per clock and skips null primitives at no additional
561 * cost. So compacting primitives can only be beneficial when
562 * there are 4 or more contiguous null primitives in the export
563 * (in the common case of single-dword prim exports).
564 */
565 build_sendmsg_gs_alloc_req(ctx, vertlive_scan.result_reduce, num_emit_threads);
566
567 /* Setup the reverse vertex compaction permutation. We re-use stream 1
568 * of the primitive liveness flags, relying on the fact that each
569 * threadgroup can have at most 256 threads. */
570 ac_build_ifcc(&ctx->ac, vertlive, 5130);
571 {
572 tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive);
573 LLVMValueRef gep_idx[3] = {
574 ctx->ac.i32_0, /* implicit C-style array */
575 ctx->ac.i32_1, /* second value of struct */
576 ctx->ac.i32_1, /* stream 1 */
577 };
578 tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, "");
579 tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, "");
580 LLVMBuildStore(builder, tmp2, tmp);
581 }
582 ac_build_endif(&ctx->ac, 5130);
583
584 ac_build_s_barrier(&ctx->ac);
585
586 /* Export primitive data */
587 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
588 ac_build_ifcc(&ctx->ac, tmp, 5140);
589 {
590 struct ngg_prim prim = {};
591 prim.num_vertices = verts_per_prim;
592
593 tmp = ngg_gs_vertex_ptr(ctx, tid);
594 LLVMValueRef gep_idx[3] = {
595 ctx->ac.i32_0, /* implicit C-style array */
596 ctx->ac.i32_1, /* second value of struct */
597 ctx->ac.i32_0, /* primflag */
598 };
599 tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, "");
600 tmp = LLVMBuildLoad(builder, tmp, "");
601 prim.isnull = LLVMBuildICmp(builder, LLVMIntEQ, tmp,
602 LLVMConstInt(ctx->ac.i8, 0, false), "");
603
604 for (unsigned i = 0; i < verts_per_prim; ++i) {
605 prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive,
606 LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
607 prim.edgeflag[i] = ctx->ac.i1false;
608 }
609
610 build_export_prim(ctx, &prim);
611 }
612 ac_build_endif(&ctx->ac, 5140);
613
614 /* Export position and parameter data */
615 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, "");
616 ac_build_ifcc(&ctx->ac, tmp, 5145);
617 {
618 struct si_shader_output_values *outputs = NULL;
619 outputs = MALLOC(info->num_outputs * sizeof(outputs[0]));
620
621 tmp = ngg_gs_vertex_ptr(ctx, tid);
622 LLVMValueRef gep_idx[3] = {
623 ctx->ac.i32_0, /* implicit C-style array */
624 ctx->ac.i32_1, /* second value of struct */
625 ctx->ac.i32_1, /* stream 1: source data index */
626 };
627 tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, "");
628 tmp = LLVMBuildLoad(builder, tmp, "");
629 tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
630 const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp);
631
632 unsigned out_idx = 0;
633 gep_idx[1] = ctx->ac.i32_0;
634 for (unsigned i = 0; i < info->num_outputs; i++) {
635 outputs[i].semantic_name = info->output_semantic_name[i];
636 outputs[i].semantic_index = info->output_semantic_index[i];
637
638 for (unsigned j = 0; j < 4; j++, out_idx++) {
639 gep_idx[2] = LLVMConstInt(ctx->ac.i32, out_idx, false);
640 tmp = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, "");
641 tmp = LLVMBuildLoad(builder, tmp, "");
642 outputs[i].values[j] = ac_to_float(&ctx->ac, tmp);
643 outputs[i].vertex_stream[j] =
644 (info->output_streams[i] >> (2 * j)) & 3;
645 }
646 }
647
648 si_llvm_export_vs(ctx, outputs, info->num_outputs);
649
650 FREE(outputs);
651 }
652 ac_build_endif(&ctx->ac, 5145);
653 }