2 * Copyright 2017 Advanced Micro Devices, Inc.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
25 #include "si_shader_internal.h"
29 #include "util/u_memory.h"
30 #include "util/u_prim.h"
32 static LLVMValueRef
get_wave_id_in_tg(struct si_shader_context
*ctx
)
34 return si_unpack_param(ctx
, ctx
->param_merged_wave_info
, 24, 4);
37 static LLVMValueRef
get_tgsize(struct si_shader_context
*ctx
)
39 return si_unpack_param(ctx
, ctx
->param_merged_wave_info
, 28, 4);
42 static LLVMValueRef
get_thread_id_in_tg(struct si_shader_context
*ctx
)
44 LLVMBuilderRef builder
= ctx
->ac
.builder
;
46 tmp
= LLVMBuildMul(builder
, get_wave_id_in_tg(ctx
),
47 LLVMConstInt(ctx
->ac
.i32
, 64, false), "");
48 return LLVMBuildAdd(builder
, tmp
, ac_get_thread_id(&ctx
->ac
), "");
51 static LLVMValueRef
ngg_get_vtx_cnt(struct si_shader_context
*ctx
)
53 return ac_build_bfe(&ctx
->ac
, ctx
->gs_tg_info
,
54 LLVMConstInt(ctx
->ac
.i32
, 12, false),
55 LLVMConstInt(ctx
->ac
.i32
, 9, false),
59 static LLVMValueRef
ngg_get_prim_cnt(struct si_shader_context
*ctx
)
61 return ac_build_bfe(&ctx
->ac
, ctx
->gs_tg_info
,
62 LLVMConstInt(ctx
->ac
.i32
, 22, false),
63 LLVMConstInt(ctx
->ac
.i32
, 9, false),
67 /* Send GS Alloc Req message from the first wave of the group to SPI.
69 * - bits 0..10: vertices in group
70 * - bits 12..22: primitives in group
72 static void build_sendmsg_gs_alloc_req(struct si_shader_context
*ctx
,
74 LLVMValueRef prim_cnt
)
76 LLVMBuilderRef builder
= ctx
->ac
.builder
;
79 tmp
= LLVMBuildICmp(builder
, LLVMIntEQ
, get_wave_id_in_tg(ctx
), ctx
->ac
.i32_0
, "");
80 ac_build_ifcc(&ctx
->ac
, tmp
, 5020);
82 tmp
= LLVMBuildShl(builder
, prim_cnt
, LLVMConstInt(ctx
->ac
.i32
, 12, false),"");
83 tmp
= LLVMBuildOr(builder
, tmp
, vtx_cnt
, "");
84 ac_build_sendmsg(&ctx
->ac
, AC_SENDMSG_GS_ALLOC_REQ
, tmp
);
86 ac_build_endif(&ctx
->ac
, 5020);
90 unsigned num_vertices
;
92 LLVMValueRef index
[3];
93 LLVMValueRef edgeflag
[3];
96 static void build_export_prim(struct si_shader_context
*ctx
,
97 const struct ngg_prim
*prim
)
99 LLVMBuilderRef builder
= ctx
->ac
.builder
;
100 struct ac_export_args args
;
103 tmp
= LLVMBuildZExt(builder
, prim
->isnull
, ctx
->ac
.i32
, "");
104 args
.out
[0] = LLVMBuildShl(builder
, tmp
, LLVMConstInt(ctx
->ac
.i32
, 31, false), "");
106 for (unsigned i
= 0; i
< prim
->num_vertices
; ++i
) {
107 tmp
= LLVMBuildShl(builder
, prim
->index
[i
],
108 LLVMConstInt(ctx
->ac
.i32
, 10 * i
, false), "");
109 args
.out
[0] = LLVMBuildOr(builder
, args
.out
[0], tmp
, "");
110 tmp
= LLVMBuildZExt(builder
, prim
->edgeflag
[i
], ctx
->ac
.i32
, "");
111 tmp
= LLVMBuildShl(builder
, tmp
,
112 LLVMConstInt(ctx
->ac
.i32
, 10 * i
+ 9, false), "");
113 args
.out
[0] = LLVMBuildOr(builder
, args
.out
[0], tmp
, "");
116 args
.out
[0] = LLVMBuildBitCast(builder
, args
.out
[0], ctx
->ac
.f32
, "");
117 args
.out
[1] = LLVMGetUndef(ctx
->ac
.f32
);
118 args
.out
[2] = LLVMGetUndef(ctx
->ac
.f32
);
119 args
.out
[3] = LLVMGetUndef(ctx
->ac
.f32
);
121 args
.target
= V_008DFC_SQ_EXP_PRIM
;
122 args
.enabled_channels
= 1;
124 args
.valid_mask
= false;
127 ac_build_export(&ctx
->ac
, &args
);
131 * Emit the epilogue of an API VS or TES shader compiled as ESGS shader.
133 void gfx10_emit_ngg_epilogue(struct ac_shader_abi
*abi
,
134 unsigned max_outputs
,
137 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
138 struct tgsi_shader_info
*info
= &ctx
->shader
->selector
->info
;
139 struct si_shader_output_values
*outputs
= NULL
;
140 LLVMBuilderRef builder
= ctx
->ac
.builder
;
141 struct lp_build_if_state if_state
;
144 assert(!ctx
->shader
->is_gs_copy_shader
);
145 assert(info
->num_outputs
<= max_outputs
);
147 outputs
= MALLOC((info
->num_outputs
+ 1) * sizeof(outputs
[0]));
149 for (unsigned i
= 0; i
< info
->num_outputs
; i
++) {
150 outputs
[i
].semantic_name
= info
->output_semantic_name
[i
];
151 outputs
[i
].semantic_index
= info
->output_semantic_index
[i
];
153 /* This is used only by streamout. */
154 for (unsigned j
= 0; j
< 4; j
++) {
155 outputs
[i
].values
[j
] =
156 LLVMBuildLoad(builder
,
159 outputs
[i
].vertex_stream
[j
] =
160 (info
->output_streams
[i
] >> (2 * j
)) & 3;
164 lp_build_endif(&ctx
->merged_wrap_if_state
);
166 LLVMValueRef prims_in_wave
= si_unpack_param(ctx
, ctx
->param_merged_wave_info
, 8, 8);
167 LLVMValueRef vtx_in_wave
= si_unpack_param(ctx
, ctx
->param_merged_wave_info
, 0, 8);
168 LLVMValueRef is_gs_thread
= LLVMBuildICmp(builder
, LLVMIntULT
,
169 ac_get_thread_id(&ctx
->ac
), prims_in_wave
, "");
170 LLVMValueRef is_es_thread
= LLVMBuildICmp(builder
, LLVMIntULT
,
171 ac_get_thread_id(&ctx
->ac
), vtx_in_wave
, "");
172 LLVMValueRef vtxindex
[] = {
173 si_unpack_param(ctx
, ctx
->param_gs_vtx01_offset
, 0, 16),
174 si_unpack_param(ctx
, ctx
->param_gs_vtx01_offset
, 16, 16),
175 si_unpack_param(ctx
, ctx
->param_gs_vtx23_offset
, 0, 16),
178 /* Determine the number of vertices per primitive. */
179 unsigned num_vertices
;
180 LLVMValueRef num_vertices_val
;
182 if (ctx
->type
== PIPE_SHADER_VERTEX
) {
183 if (info
->properties
[TGSI_PROPERTY_VS_BLIT_SGPRS
]) {
184 /* Blits always use axis-aligned rectangles with 3 vertices. */
186 num_vertices_val
= LLVMConstInt(ctx
->i32
, 3, 0);
188 /* Extract OUTPRIM field. */
189 tmp
= si_unpack_param(ctx
, ctx
->param_vs_state_bits
, 2, 2);
190 num_vertices_val
= LLVMBuildAdd(builder
, tmp
, ctx
->i32_1
, "");
191 num_vertices
= 3; /* TODO: optimize for points & lines */
194 assert(ctx
->type
== PIPE_SHADER_TESS_EVAL
);
196 if (info
->properties
[TGSI_PROPERTY_TES_POINT_MODE
])
198 else if (info
->properties
[TGSI_PROPERTY_TES_PRIM_MODE
] == PIPE_PRIM_LINES
)
203 num_vertices_val
= LLVMConstInt(ctx
->i32
, num_vertices
, false);
206 /* TODO: streamout */
208 /* TODO: primitive culling */
210 build_sendmsg_gs_alloc_req(ctx
, ngg_get_vtx_cnt(ctx
), ngg_get_prim_cnt(ctx
));
212 /* Export primitive data to the index buffer. Format is:
213 * - bits 0..8: index 0
214 * - bit 9: edge flag 0
215 * - bits 10..18: index 1
216 * - bit 19: edge flag 1
217 * - bits 20..28: index 2
218 * - bit 29: edge flag 2
219 * - bit 31: null primitive (skip)
221 * For the first version, we will always build up all three indices
222 * independent of the primitive type. The additional garbage data
225 * TODO: culling depends on the primitive type, so can have some
228 lp_build_if(&if_state
, &ctx
->gallivm
, is_gs_thread
);
230 struct ngg_prim prim
= {};
232 prim
.num_vertices
= num_vertices
;
233 prim
.isnull
= ctx
->ac
.i1false
;
234 memcpy(prim
.index
, vtxindex
, sizeof(vtxindex
[0]) * 3);
236 for (unsigned i
= 0; i
< num_vertices
; ++i
) {
237 tmp
= LLVMBuildLShr(builder
, ctx
->abi
.gs_invocation_id
,
238 LLVMConstInt(ctx
->ac
.i32
, 8 + i
, false), "");
239 prim
.edgeflag
[i
] = LLVMBuildTrunc(builder
, tmp
, ctx
->ac
.i1
, "");
242 build_export_prim(ctx
, &prim
);
244 lp_build_endif(&if_state
);
246 /* Export per-vertex data (positions and parameters). */
247 lp_build_if(&if_state
, &ctx
->gallivm
, is_es_thread
);
251 /* Unconditionally (re-)load the values for proper SSA form. */
252 for (i
= 0; i
< info
->num_outputs
; i
++) {
253 for (unsigned j
= 0; j
< 4; j
++) {
254 outputs
[i
].values
[j
] =
255 LLVMBuildLoad(builder
,
261 /* TODO: Vertex shaders have to get PrimitiveID from GS VGPRs. */
262 if (ctx
->type
== PIPE_SHADER_TESS_EVAL
&&
263 ctx
->shader
->key
.mono
.u
.vs_export_prim_id
) {
264 outputs
[i
].semantic_name
= TGSI_SEMANTIC_PRIMID
;
265 outputs
[i
].semantic_index
= 0;
266 outputs
[i
].values
[0] = ac_to_float(&ctx
->ac
, si_get_primitive_id(ctx
, 0));
267 for (unsigned j
= 1; j
< 4; j
++)
268 outputs
[i
].values
[j
] = LLVMGetUndef(ctx
->f32
);
270 memset(outputs
[i
].vertex_stream
, 0,
271 sizeof(outputs
[i
].vertex_stream
));
275 si_llvm_export_vs(ctx
, outputs
, i
);
277 lp_build_endif(&if_state
);
283 ngg_gs_get_vertex_storage(struct si_shader_context
*ctx
)
285 const struct si_shader_selector
*sel
= ctx
->shader
->selector
;
286 const struct tgsi_shader_info
*info
= &sel
->info
;
288 LLVMTypeRef elements
[2] = {
289 LLVMArrayType(ctx
->ac
.i32
, 4 * info
->num_outputs
),
290 LLVMArrayType(ctx
->ac
.i8
, 4),
292 LLVMTypeRef type
= LLVMStructTypeInContext(ctx
->ac
.context
, elements
, 2, false);
293 type
= LLVMPointerType(LLVMArrayType(type
, 0), AC_ADDR_SPACE_LDS
);
294 return LLVMBuildBitCast(ctx
->ac
.builder
, ctx
->gs_ngg_emit
, type
, "");
298 * Return a pointer to the LDS storage reserved for the N'th vertex, where N
299 * is in emit order; that is:
300 * - during the epilogue, N is the threadidx (relative to the entire threadgroup)
301 * - during vertex emit, i.e. while the API GS shader invocation is running,
302 * N = threadidx * gs_max_out_vertices + emitidx
304 * Goals of the LDS memory layout:
305 * 1. Eliminate bank conflicts on write for geometry shaders that have all emits
306 * in uniform control flow
307 * 2. Eliminate bank conflicts on read for export if, additionally, there is no
309 * 3. Agnostic to the number of waves (since we don't know it before compiling)
310 * 4. Allow coalescing of LDS instructions (ds_write_b128 etc.)
311 * 5. Avoid wasting memory.
313 * We use an AoS layout due to point 4 (this also helps point 3). In an AoS
314 * layout, elimination of bank conflicts requires that each vertex occupy an
315 * odd number of dwords. We use the additional dword to store the output stream
316 * index as well as a flag to indicate whether this vertex ends a primitive
319 * Swizzling is required to satisfy points 1 and 2 simultaneously.
321 * Vertices are stored in export order (gsthread * gs_max_out_vertices + emitidx).
322 * Indices are swizzled in groups of 32, which ensures point 1 without
323 * disturbing point 2.
325 * \return an LDS pointer to type {[N x i32], [4 x i8]}
328 ngg_gs_vertex_ptr(struct si_shader_context
*ctx
, LLVMValueRef vertexidx
)
330 struct si_shader_selector
*sel
= ctx
->shader
->selector
;
331 LLVMBuilderRef builder
= ctx
->ac
.builder
;
332 LLVMValueRef storage
= ngg_gs_get_vertex_storage(ctx
);
334 /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
335 unsigned write_stride_2exp
= ffs(sel
->gs_max_out_vertices
) - 1;
336 if (write_stride_2exp
) {
338 LLVMBuildLShr(builder
, vertexidx
,
339 LLVMConstInt(ctx
->ac
.i32
, 5, false), "");
340 LLVMValueRef swizzle
=
341 LLVMBuildAnd(builder
, row
,
342 LLVMConstInt(ctx
->ac
.i32
, (1u << write_stride_2exp
) - 1,
344 vertexidx
= LLVMBuildXor(builder
, vertexidx
, swizzle
, "");
347 return ac_build_gep0(&ctx
->ac
, storage
, vertexidx
);
351 ngg_gs_emit_vertex_ptr(struct si_shader_context
*ctx
, LLVMValueRef gsthread
,
352 LLVMValueRef emitidx
)
354 struct si_shader_selector
*sel
= ctx
->shader
->selector
;
355 LLVMBuilderRef builder
= ctx
->ac
.builder
;
358 tmp
= LLVMConstInt(ctx
->ac
.i32
, sel
->gs_max_out_vertices
, false);
359 tmp
= LLVMBuildMul(builder
, tmp
, gsthread
, "");
360 const LLVMValueRef vertexidx
= LLVMBuildAdd(builder
, tmp
, emitidx
, "");
361 return ngg_gs_vertex_ptr(ctx
, vertexidx
);
364 void gfx10_ngg_gs_emit_vertex(struct si_shader_context
*ctx
,
368 const struct si_shader_selector
*sel
= ctx
->shader
->selector
;
369 const struct tgsi_shader_info
*info
= &sel
->info
;
370 LLVMBuilderRef builder
= ctx
->ac
.builder
;
371 struct lp_build_if_state if_state
;
373 const LLVMValueRef vertexidx
=
374 LLVMBuildLoad(builder
, ctx
->gs_next_vertex
[stream
], "");
376 /* If this thread has already emitted the declared maximum number of
377 * vertices, skip the write: excessive vertex emissions are not
378 * supposed to have any effect.
380 const LLVMValueRef can_emit
=
381 LLVMBuildICmp(builder
, LLVMIntULT
, vertexidx
,
382 LLVMConstInt(ctx
->i32
, sel
->gs_max_out_vertices
, false), "");
384 tmp
= LLVMBuildAdd(builder
, vertexidx
, ctx
->ac
.i32_1
, "");
385 tmp
= LLVMBuildSelect(builder
, can_emit
, tmp
, vertexidx
, "");
386 LLVMBuildStore(builder
, tmp
, ctx
->gs_next_vertex
[stream
]);
388 lp_build_if(&if_state
, &ctx
->gallivm
, can_emit
);
390 const LLVMValueRef vertexptr
=
391 ngg_gs_emit_vertex_ptr(ctx
, get_thread_id_in_tg(ctx
), vertexidx
);
392 unsigned out_idx
= 0;
393 for (unsigned i
= 0; i
< info
->num_outputs
; i
++) {
394 for (unsigned chan
= 0; chan
< 4; chan
++, out_idx
++) {
395 if (!(info
->output_usagemask
[i
] & (1 << chan
)) ||
396 ((info
->output_streams
[i
] >> (2 * chan
)) & 3) != stream
)
399 LLVMValueRef out_val
= LLVMBuildLoad(builder
, addrs
[4 * i
+ chan
], "");
400 LLVMValueRef gep_idx
[3] = {
401 ctx
->ac
.i32_0
, /* implied C-style array */
402 ctx
->ac
.i32_0
, /* first entry of struct */
403 LLVMConstInt(ctx
->ac
.i32
, out_idx
, false),
405 LLVMValueRef ptr
= LLVMBuildGEP(builder
, vertexptr
, gep_idx
, 3, "");
407 out_val
= ac_to_integer(&ctx
->ac
, out_val
);
408 LLVMBuildStore(builder
, out_val
, ptr
);
411 assert(out_idx
* 4 == sel
->gsvs_vertex_size
);
413 /* Determine and store whether this vertex completed a primitive. */
414 const LLVMValueRef curverts
= LLVMBuildLoad(builder
, ctx
->gs_curprim_verts
[stream
], "");
416 tmp
= LLVMConstInt(ctx
->ac
.i32
, u_vertices_per_prim(sel
->gs_output_prim
) - 1, false);
417 const LLVMValueRef iscompleteprim
=
418 LLVMBuildICmp(builder
, LLVMIntUGE
, curverts
, tmp
, "");
420 tmp
= LLVMBuildAdd(builder
, curverts
, ctx
->ac
.i32_1
, "");
421 LLVMBuildStore(builder
, tmp
, ctx
->gs_curprim_verts
[stream
]);
423 LLVMValueRef gep_idx
[3] = {
424 ctx
->ac
.i32_0
, /* implied C-style array */
425 ctx
->ac
.i32_1
, /* second struct entry */
426 LLVMConstInt(ctx
->ac
.i32
, stream
, false),
428 const LLVMValueRef primflagptr
=
429 LLVMBuildGEP(builder
, vertexptr
, gep_idx
, 3, "");
431 tmp
= LLVMBuildZExt(builder
, iscompleteprim
, ctx
->ac
.i8
, "");
432 LLVMBuildStore(builder
, tmp
, primflagptr
);
434 lp_build_endif(&if_state
);
437 void gfx10_ngg_gs_emit_epilogue(struct si_shader_context
*ctx
)
439 const struct si_shader_selector
*sel
= ctx
->shader
->selector
;
440 const struct tgsi_shader_info
*info
= &sel
->info
;
441 const unsigned verts_per_prim
= u_vertices_per_prim(sel
->gs_output_prim
);
442 LLVMBuilderRef builder
= ctx
->ac
.builder
;
443 LLVMValueRef i8_0
= LLVMConstInt(ctx
->ac
.i8
, 0, false);
444 LLVMValueRef tmp
, tmp2
;
446 /* Zero out remaining (non-emitted) primitive flags.
448 * Note: Alternatively, we could pass the relevant gs_next_vertex to
449 * the emit threads via LDS. This is likely worse in the expected
450 * typical case where each GS thread emits the full set of
453 for (unsigned stream
= 0; stream
< 4; ++stream
) {
454 if (!info
->num_stream_output_components
[stream
])
457 const LLVMValueRef gsthread
= get_thread_id_in_tg(ctx
);
459 ac_build_bgnloop(&ctx
->ac
, 5100);
461 const LLVMValueRef vertexidx
=
462 LLVMBuildLoad(builder
, ctx
->gs_next_vertex
[stream
], "");
463 tmp
= LLVMBuildICmp(builder
, LLVMIntUGE
, vertexidx
,
464 LLVMConstInt(ctx
->ac
.i32
, sel
->gs_max_out_vertices
, false), "");
465 ac_build_ifcc(&ctx
->ac
, tmp
, 5101);
466 ac_build_break(&ctx
->ac
);
467 ac_build_endif(&ctx
->ac
, 5101);
469 tmp
= LLVMBuildAdd(builder
, vertexidx
, ctx
->ac
.i32_1
, "");
470 LLVMBuildStore(builder
, tmp
, ctx
->gs_next_vertex
[stream
]);
472 tmp
= ngg_gs_emit_vertex_ptr(ctx
, gsthread
, vertexidx
);
473 LLVMValueRef gep_idx
[3] = {
474 ctx
->ac
.i32_0
, /* implied C-style array */
475 ctx
->ac
.i32_1
, /* second entry of struct */
476 LLVMConstInt(ctx
->ac
.i32
, stream
, false),
478 tmp
= LLVMBuildGEP(builder
, tmp
, gep_idx
, 3, "");
479 LLVMBuildStore(builder
, i8_0
, tmp
);
481 ac_build_endloop(&ctx
->ac
, 5100);
484 lp_build_endif(&ctx
->merged_wrap_if_state
);
486 ac_build_s_barrier(&ctx
->ac
);
488 const LLVMValueRef tid
= get_thread_id_in_tg(ctx
);
489 LLVMValueRef num_emit_threads
= ngg_get_prim_cnt(ctx
);
491 /* TODO: streamout */
495 /* Determine vertex liveness. */
496 LLVMValueRef vertliveptr
= lp_build_alloca(&ctx
->gallivm
, ctx
->ac
.i1
, "vertexlive");
498 tmp
= LLVMBuildICmp(builder
, LLVMIntULT
, tid
, num_emit_threads
, "");
499 ac_build_ifcc(&ctx
->ac
, tmp
, 5120);
501 for (unsigned i
= 0; i
< verts_per_prim
; ++i
) {
502 const LLVMValueRef primidx
=
503 LLVMBuildAdd(builder
, tid
,
504 LLVMConstInt(ctx
->ac
.i32
, i
, false), "");
507 tmp
= LLVMBuildICmp(builder
, LLVMIntULT
, primidx
, num_emit_threads
, "");
508 ac_build_ifcc(&ctx
->ac
, tmp
, 5121 + i
);
511 /* Load primitive liveness */
512 tmp
= ngg_gs_vertex_ptr(ctx
, primidx
);
513 LLVMValueRef gep_idx
[3] = {
514 ctx
->ac
.i32_0
, /* implicit C-style array */
515 ctx
->ac
.i32_1
, /* second value of struct */
516 ctx
->ac
.i32_0
, /* stream 0 */
518 tmp
= LLVMBuildGEP(builder
, tmp
, gep_idx
, 3, "");
519 tmp
= LLVMBuildLoad(builder
, tmp
, "");
520 const LLVMValueRef primlive
=
521 LLVMBuildTrunc(builder
, tmp
, ctx
->ac
.i1
, "");
523 tmp
= LLVMBuildLoad(builder
, vertliveptr
, "");
524 tmp
= LLVMBuildOr(builder
, tmp
, primlive
, ""),
525 LLVMBuildStore(builder
, tmp
, vertliveptr
);
528 ac_build_endif(&ctx
->ac
, 5121 + i
);
531 ac_build_endif(&ctx
->ac
, 5120);
533 /* Inclusive scan addition across the current wave. */
534 LLVMValueRef vertlive
= LLVMBuildLoad(builder
, vertliveptr
, "");
535 struct ac_wg_scan vertlive_scan
= {};
536 vertlive_scan
.op
= nir_op_iadd
;
537 vertlive_scan
.enable_reduce
= true;
538 vertlive_scan
.enable_exclusive
= true;
539 vertlive_scan
.src
= vertlive
;
540 vertlive_scan
.scratch
= ac_build_gep0(&ctx
->ac
, ctx
->gs_ngg_scratch
, ctx
->i32_0
);
541 vertlive_scan
.waveidx
= get_wave_id_in_tg(ctx
);
542 vertlive_scan
.numwaves
= get_tgsize(ctx
);
543 vertlive_scan
.maxwaves
= 8;
545 ac_build_wg_scan(&ctx
->ac
, &vertlive_scan
);
547 /* Skip all exports (including index exports) when possible. At least on
548 * early gfx10 revisions this is also to avoid hangs.
550 LLVMValueRef have_exports
=
551 LLVMBuildICmp(builder
, LLVMIntNE
, vertlive_scan
.result_reduce
, ctx
->ac
.i32_0
, "");
553 LLVMBuildSelect(builder
, have_exports
, num_emit_threads
, ctx
->ac
.i32_0
, "");
555 /* Allocate export space. Send this message as early as possible, to
556 * hide the latency of the SQ <-> SPI roundtrip.
558 * Note: We could consider compacting primitives for export as well.
559 * PA processes 1 non-null prim / clock, but it fetches 4 DW of
560 * prim data per clock and skips null primitives at no additional
561 * cost. So compacting primitives can only be beneficial when
562 * there are 4 or more contiguous null primitives in the export
563 * (in the common case of single-dword prim exports).
565 build_sendmsg_gs_alloc_req(ctx
, vertlive_scan
.result_reduce
, num_emit_threads
);
567 /* Setup the reverse vertex compaction permutation. We re-use stream 1
568 * of the primitive liveness flags, relying on the fact that each
569 * threadgroup can have at most 256 threads. */
570 ac_build_ifcc(&ctx
->ac
, vertlive
, 5130);
572 tmp
= ngg_gs_vertex_ptr(ctx
, vertlive_scan
.result_exclusive
);
573 LLVMValueRef gep_idx
[3] = {
574 ctx
->ac
.i32_0
, /* implicit C-style array */
575 ctx
->ac
.i32_1
, /* second value of struct */
576 ctx
->ac
.i32_1
, /* stream 1 */
578 tmp
= LLVMBuildGEP(builder
, tmp
, gep_idx
, 3, "");
579 tmp2
= LLVMBuildTrunc(builder
, tid
, ctx
->ac
.i8
, "");
580 LLVMBuildStore(builder
, tmp2
, tmp
);
582 ac_build_endif(&ctx
->ac
, 5130);
584 ac_build_s_barrier(&ctx
->ac
);
586 /* Export primitive data */
587 tmp
= LLVMBuildICmp(builder
, LLVMIntULT
, tid
, num_emit_threads
, "");
588 ac_build_ifcc(&ctx
->ac
, tmp
, 5140);
590 struct ngg_prim prim
= {};
591 prim
.num_vertices
= verts_per_prim
;
593 tmp
= ngg_gs_vertex_ptr(ctx
, tid
);
594 LLVMValueRef gep_idx
[3] = {
595 ctx
->ac
.i32_0
, /* implicit C-style array */
596 ctx
->ac
.i32_1
, /* second value of struct */
597 ctx
->ac
.i32_0
, /* primflag */
599 tmp
= LLVMBuildGEP(builder
, tmp
, gep_idx
, 3, "");
600 tmp
= LLVMBuildLoad(builder
, tmp
, "");
601 prim
.isnull
= LLVMBuildICmp(builder
, LLVMIntEQ
, tmp
,
602 LLVMConstInt(ctx
->ac
.i8
, 0, false), "");
604 for (unsigned i
= 0; i
< verts_per_prim
; ++i
) {
605 prim
.index
[i
] = LLVMBuildSub(builder
, vertlive_scan
.result_exclusive
,
606 LLVMConstInt(ctx
->ac
.i32
, verts_per_prim
- i
- 1, false), "");
607 prim
.edgeflag
[i
] = ctx
->ac
.i1false
;
610 build_export_prim(ctx
, &prim
);
612 ac_build_endif(&ctx
->ac
, 5140);
614 /* Export position and parameter data */
615 tmp
= LLVMBuildICmp(builder
, LLVMIntULT
, tid
, vertlive_scan
.result_reduce
, "");
616 ac_build_ifcc(&ctx
->ac
, tmp
, 5145);
618 struct si_shader_output_values
*outputs
= NULL
;
619 outputs
= MALLOC(info
->num_outputs
* sizeof(outputs
[0]));
621 tmp
= ngg_gs_vertex_ptr(ctx
, tid
);
622 LLVMValueRef gep_idx
[3] = {
623 ctx
->ac
.i32_0
, /* implicit C-style array */
624 ctx
->ac
.i32_1
, /* second value of struct */
625 ctx
->ac
.i32_1
, /* stream 1: source data index */
627 tmp
= LLVMBuildGEP(builder
, tmp
, gep_idx
, 3, "");
628 tmp
= LLVMBuildLoad(builder
, tmp
, "");
629 tmp
= LLVMBuildZExt(builder
, tmp
, ctx
->ac
.i32
, "");
630 const LLVMValueRef vertexptr
= ngg_gs_vertex_ptr(ctx
, tmp
);
632 unsigned out_idx
= 0;
633 gep_idx
[1] = ctx
->ac
.i32_0
;
634 for (unsigned i
= 0; i
< info
->num_outputs
; i
++) {
635 outputs
[i
].semantic_name
= info
->output_semantic_name
[i
];
636 outputs
[i
].semantic_index
= info
->output_semantic_index
[i
];
638 for (unsigned j
= 0; j
< 4; j
++, out_idx
++) {
639 gep_idx
[2] = LLVMConstInt(ctx
->ac
.i32
, out_idx
, false);
640 tmp
= LLVMBuildGEP(builder
, vertexptr
, gep_idx
, 3, "");
641 tmp
= LLVMBuildLoad(builder
, tmp
, "");
642 outputs
[i
].values
[j
] = ac_to_float(&ctx
->ac
, tmp
);
643 outputs
[i
].vertex_stream
[j
] =
644 (info
->output_streams
[i
] >> (2 * j
)) & 3;
648 si_llvm_export_vs(ctx
, outputs
, info
->num_outputs
);
652 ac_build_endif(&ctx
->ac
, 5145);