/*
 * Copyright 2019 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include "si_shader_internal.h"
#include "si_build_pm4.h"
#include "ac_llvm_cull.h"

#include "util/u_prim.h"
#include "util/u_suballoc.h"
#include "util/u_upload_mgr.h"
#include "util/fast_idiv_by_const.h"

/* Based on:
 * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf
 */
/* This file implements primitive culling using asynchronous compute.
 * It's written to be GL conformant.
 *
 * It takes a monolithic VS in LLVM IR returning gl_Position and invokes it
 * in a compute shader. The shader processes 1 primitive/thread by invoking
 * the VS for each vertex to get the positions, decomposes strips and fans
 * into triangles (if needed), eliminates primitive restart (if needed),
 * does (W<0) culling, face culling, view XY culling, zero-area and
 * small-primitive culling, and generates a new index buffer that doesn't
 * contain culled primitives.
 *
 * The index buffer is generated using the Ordered Count feature of GDS,
 * which is an atomic counter that is incremented in the wavefront launch
 * order, so that the original primitive order is preserved.
 *
 * Another GDS ordered counter is used to eliminate primitive restart indices.
 * If a restart index lands on an even thread ID, the compute shader has to flip
 * the primitive orientation of the whole following triangle strip. The primitive
 * orientation has to be correct after strip and fan decomposition for two-sided
 * shading to behave correctly. The decomposition also needs to be aware of
 * which vertex is the provoking vertex for flat shading to behave correctly.
 *
 * IB = a GPU command buffer
 *
 * Both the compute and gfx IBs run in parallel sort of like CE and DE.
 * The gfx IB has a CP barrier (REWIND packet) before a draw packet. REWIND
 * doesn't continue if its word isn't 0x80000000. Once compute shaders are
 * finished culling, the last wave will write the final primitive count from
 * GDS directly into the count word of the draw packet in the gfx IB, and
 * a CS_DONE event will signal the REWIND packet to continue. It's really
 * a direct draw with command buffer patching from the compute queue.
 *
 * The compute IB doesn't have to start when its corresponding gfx IB starts,
 * but can start sooner. The compute IB is signaled to start after the last
 * execution barrier in the *previous* gfx IB. This is handled as follows.
 * The kernel GPU scheduler starts the compute IB after the previous gfx IB has
 * started. The compute IB then waits (WAIT_REG_MEM) for a mid-IB fence that
 * represents the barrier in the previous gfx IB.
 *
 * The following are implemented:
 * - Triangle strips and fans are decomposed into an indexed triangle list.
 *   The decomposition differs based on the provoking vertex state.
 * - Instanced draws are converted into non-instanced draws for 16-bit indices.
 *   (InstanceID is stored in the high bits of VertexID and unpacked by VS)
 * - Primitive restart is fully supported with triangle strips, including
 *   correct primitive orientation across multiple waves. (restart indices
 *   reset primitive orientation)
 * - W<0 culling (W<0 is behind the viewer, sort of like near Z culling).
 * - Back face culling, incl. culling zero-area / degenerate primitives.
 * - View XY culling.
 * - View Z culling (disabled due to limited impact with perspective projection).
 * - Small primitive culling for all MSAA modes and all quant modes.
 *
 * The following are not implemented:
 * - ClipVertex/ClipDistance/CullDistance-based culling.
 *
 * Limitations (and unimplemented features that may be possible to implement):
 * - Only triangles, triangle strips, and triangle fans are supported.
 * - Primitive restart is only supported with triangle strips.
 * - Instancing and primitive restart can't be used together.
 * - Instancing is only supported with 16-bit indices and instance count <= 2^16.
 * - The instance divisor buffer is unavailable, so all divisors must be
 *   either 0 or 1.
 * - Multidraws where the vertex shader reads gl_DrawID are unsupported.
 * - No support for tessellation and geometry shaders.
 *   (patch elimination where tess factors are 0 would be possible to implement)
 * - The vertex shader must not contain memory stores.
 * - All VS resources must not have a write usage in the command buffer.
 *   (TODO: all shader buffers currently set the write usage)
 * - Bindless textures and images must not occur in the vertex shader.
 *
 * User data SGPR layout:
 *   INDEX_BUFFERS: pointer to constants
 *     0..3: input index buffer - typed buffer view
 *     4..7: output index buffer - typed buffer view
 *     8..11: viewport state - scale.xy, translate.xy
 *   VERTEX_COUNTER: counter address or first primitive ID
 *     - If unordered memory counter: address of "count" in the draw packet
 *       and is incremented atomically by the shader.
 *     - If unordered GDS counter: address of "count" in GDS starting from 0,
 *       must be initialized to 0 before the dispatch.
 *     - If ordered GDS counter: the primitive ID that should reset the vertex
 *       counter to 0 in GDS
 *   LAST_WAVE_PRIM_ID: the primitive ID that should write the final vertex
 *                      count to memory if using GDS ordered append
 *   VERTEX_COUNT_ADDR: where the last wave should write the vertex count if
 *                      using GDS ordered append
 *   VS.VERTEX_BUFFERS: same value as VS
 *   VS.CONST_AND_SHADER_BUFFERS: same value as VS
 *   VS.SAMPLERS_AND_IMAGES: same value as VS
 *   VS.BASE_VERTEX: same value as VS
 *   VS.START_INSTANCE: same value as VS
 *   NUM_PRIMS_UDIV_MULTIPLIER: For fast 31-bit division by the number of primitives
 *                              per instance for instancing.
 *   NUM_PRIMS_UDIV_TERMS:
 *     - Bits [0:4]: "post_shift" for fast 31-bit division for instancing.
 *     - Bits [5:31]: The number of primitives per instance for computing the remainder.
 *   PRIMITIVE_RESTART_INDEX
 *   SMALL_PRIM_CULLING_PRECISION: Scale the primitive bounding box by this number.
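 *
 * For illustration only (a sketch of how the shader below consumes the two
 * instancing terms; the symbol names here are informal, not driver state):
 *
 *   post_shift         = NUM_PRIMS_UDIV_TERMS & 0x1f;
 *   prims_per_instance = NUM_PRIMS_UDIV_TERMS >> 5;
 *   instance_id        = fast_udiv_u31(prim_id, NUM_PRIMS_UDIV_MULTIPLIER, post_shift);
 *   prim_id            = prim_id - instance_id * prims_per_instance;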
 *
 * The code contains 3 codepaths:
 * - Unordered memory counter (for debugging, random primitive order, no primitive restart)
 * - Unordered GDS counter (for debugging, random primitive order, no primitive restart)
 * - Ordered GDS counter (it preserves the primitive order)
 *
 * How to test primitive restart (the most complicated part because it needs
 * to get the primitive orientation right):
 *   Set THREADGROUP_SIZE to 2 to exercise both intra-wave and inter-wave
 *   primitive orientation flips with small draw calls, which is what most tests use.
 *   You can also enable draw call splitting into draw calls with just 2 primitives.
 */
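
/* A rough sketch of the gfx/compute handshake described above (illustrative
 * pseudo-code only; packet layouts are simplified):
 *
 *    // gfx IB (emitted by the driver):
 *    REWIND                        // CP stalls here until the word becomes 0x80000000
 *    DRAW_INDEX_2 count=<patched>  // count word is patched by the compute queue
 *
 *    // compute IB (last wave of the culling dispatch):
 *    *count_word_addr = final_index_count;               // written into the draw packet
 *    CS_DONE -> release_mem(rewind_word = 0x80000000);   // lets REWIND continue
 */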
/* At least 256 is needed for the fastest wave launch rate from compute queues
 * due to hw constraints. Nothing in the code needs more than 1 wave/threadgroup. */
#define THREADGROUP_SIZE                256 /* high numbers limit available VGPRs */
#define THREADGROUPS_PER_CU             1   /* TGs to launch on 1 CU before going onto the next, max 8 */
#define MAX_WAVES_PER_SH                0   /* no limit */
#define INDEX_STORES_USE_SLC            1   /* don't cache indices if L2 is full */
/* Don't cull Z. We already do (W < 0) culling for primitives behind the viewer. */
#define CULL_Z                          0
/* 0 = unordered memory counter, 1 = unordered GDS counter, 2 = ordered GDS counter */
#define VERTEX_COUNTER_GDS_MODE         2
#define GDS_SIZE_UNORDERED              (4 * 1024) /* only for the unordered GDS counter */

/* Grouping compute dispatches for small draw calls: How many primitives from multiple
 * draw calls to process by compute before signaling the gfx IB. This reduces the number
 * of EOP events + REWIND packets, because they decrease performance. */
#define PRIMS_PER_BATCH                 (512 * 1024)
/* Draw call splitting at the packet level. This allows signaling the gfx IB
 * for big draw calls sooner, but doesn't allow context flushes between packets.
 * Primitive restart is supported. Only implemented for ordered append. */
#define SPLIT_PRIMS_PACKET_LEVEL_VALUE  PRIMS_PER_BATCH
/* If there is not enough ring buffer space for the current IB, split draw calls into
 * this number of primitives, so that we can flush the context and get free ring space. */
#define SPLIT_PRIMS_DRAW_LEVEL          PRIMS_PER_BATCH

/* Derived values. */
#define WAVES_PER_TG                    DIV_ROUND_UP(THREADGROUP_SIZE, 64)
#define SPLIT_PRIMS_PACKET_LEVEL        (VERTEX_COUNTER_GDS_MODE == 2 ? \
                                         SPLIT_PRIMS_PACKET_LEVEL_VALUE : \
                                         UINT_MAX & ~(THREADGROUP_SIZE - 1))

#define REWIND_SIGNAL_BIT               0x80000000
/* For emulating the rewind packet on CI. */
#define FORCE_REWIND_EMULATION          0
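
/* A sanity note on the derived values with the defaults above (informational
 * only): WAVES_PER_TG = DIV_ROUND_UP(256, 64) = 4 waves per threadgroup, and
 * because VERTEX_COUNTER_GDS_MODE == 2, SPLIT_PRIMS_PACKET_LEVEL resolves to
 * PRIMS_PER_BATCH = 524288 primitives. */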
void si_initialize_prim_discard_tunables(struct si_context *sctx)
{
   sctx->prim_discard_vertex_count_threshold = UINT_MAX; /* disable */

   if (sctx->chip_class == GFX6 || /* SI support is not implemented */
       !sctx->screen->info.has_gds_ordered_append ||
       sctx->screen->debug_flags & DBG(NO_PD) ||
       /* If aux_context == NULL, we are initializing aux_context right now. */
       !sctx->screen->aux_context)
      return;

   /* TODO: enable this after the GDS kernel memory management is fixed */
   bool enable_on_pro_graphics_by_default = false;

   if (sctx->screen->debug_flags & DBG(ALWAYS_PD) ||
       sctx->screen->debug_flags & DBG(PD) ||
       (enable_on_pro_graphics_by_default &&
        sctx->screen->info.is_pro_graphics &&
        (sctx->family == CHIP_BONAIRE ||
         sctx->family == CHIP_HAWAII ||
         sctx->family == CHIP_TONGA ||
         sctx->family == CHIP_FIJI ||
         sctx->family == CHIP_POLARIS10 ||
         sctx->family == CHIP_POLARIS11 ||
         sctx->family == CHIP_VEGA10 ||
         sctx->family == CHIP_VEGA20))) {
      sctx->prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */

      if (sctx->screen->debug_flags & DBG(ALWAYS_PD))
         sctx->prim_discard_vertex_count_threshold = 0; /* always enable */

      const uint32_t MB = 1024 * 1024;
      const uint64_t GB = 1024 * 1024 * 1024;

      /* The total size is double this per context.
       * Greater numbers allow bigger gfx IBs.
       */
      if (sctx->screen->info.vram_size <= 2 * GB)
         sctx->index_ring_size_per_ib = 64 * MB;
      else if (sctx->screen->info.vram_size <= 4 * GB)
         sctx->index_ring_size_per_ib = 128 * MB;
      else
         sctx->index_ring_size_per_ib = 256 * MB;
   }
}
/* Opcode can be "add" or "swap". */
static LLVMValueRef si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode,
                                           LLVMValueRef m0, LLVMValueRef value,
                                           unsigned ordered_count_index, bool release, bool done)
{
   LLVMValueRef args[] = {
      LLVMBuildIntToPtr(ctx->ac.builder, m0,
                        LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), ""),
      value,
      LLVMConstInt(ctx->i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */
      ctx->i32_0, /* scope */
      ctx->i1false, /* volatile */
      LLVMConstInt(ctx->i32, ordered_count_index, 0),
      LLVMConstInt(ctx->i1, release, 0),
      LLVMConstInt(ctx->i1, done, 0),
   };

   char intrinsic[64];
   snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode);
   return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->i32, args, ARRAY_SIZE(args), 0);
}
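
/* Usage sketch (illustrative only; the real call sites are further below):
 * the ordered GDS counter at "ordered_count_index" is read-modify-written in
 * wave launch order, e.g.:
 *
 *    prev = si_build_ds_ordered_op(ctx, "add",  m0, value, 0, true, true);
 *    prev = si_build_ds_ordered_op(ctx, "swap", m0, value, 0, true, true);
 *
 * "add" returns the previous count and accumulates, "swap" returns the previous
 * count and replaces it with "value". */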
static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr)
{
   uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32;
   ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->i64, "");
   ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->i64, hi, 0), "");
   return LLVMBuildIntToPtr(ctx->ac.builder, ptr,
                            LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GLOBAL), "");
}
struct si_thread0_section {
   struct si_shader_context *ctx;
   LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */
   LLVMValueRef saved_exec;
};
/* Enter a section that only executes on thread 0. */
static void si_enter_thread0_section(struct si_shader_context *ctx,
                                     struct si_thread0_section *section,
                                     LLVMValueRef thread_id)
{
   section->ctx = ctx;
   section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->i32, "result0");

   /* This IF has 4 instructions:
    *   v_and_b32_e32 v, 63, v         ; get the thread ID
    *   v_cmp_eq_u32_e32 vcc, 0, v     ; thread ID == 0
    *   s_and_saveexec_b64 s, vcc
    *   s_cbranch_execz BB0_4
    *
    * It could just be s_and_saveexec_b64 s, 1.
    */
   ac_build_ifcc(&ctx->ac,
                 LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id,
                               ctx->i32_0, ""), 12601);
}
/* Exit a section that only executes on thread 0 and broadcast the result
 * to all threads. */
static void si_exit_thread0_section(struct si_thread0_section *section,
                                    LLVMValueRef *result)
{
   struct si_shader_context *ctx = section->ctx;

   LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result);

   ac_build_endif(&ctx->ac, 12601);

   /* Broadcast the result from thread 0 to all threads. */
   *result = ac_build_readlane(&ctx->ac,
                               LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL);
}
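
/* Typical usage of the two helpers above (a sketch; the real uses are in
 * si_build_prim_discard_compute_shader below):
 *
 *    struct si_thread0_section section;
 *    si_enter_thread0_section(ctx, &section, thread_id);
 *    value = ...;   // computed by thread 0 only, e.g. an atomic or ordered op
 *    si_exit_thread0_section(&section, &value);   // value is now uniform
 */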
void si_build_prim_discard_compute_shader(struct si_shader_context *ctx)
{
   struct si_shader_key *key = &ctx->shader->key;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef vs = ctx->main_fn;

   /* Always inline the VS function. */
   ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE);
   LLVMSetLinkage(vs, LLVMPrivateLinkage);

   enum ac_arg_type const_desc_type;
   if (ctx->shader->selector->info.const_buffers_declared == 1 &&
       ctx->shader->selector->info.shader_buffers_declared == 0)
      const_desc_type = AC_ARG_CONST_FLOAT_PTR;
   else
      const_desc_type = AC_ARG_CONST_DESC_PTR;

   memset(&ctx->args, 0, sizeof(ctx->args));

   struct ac_arg param_index_buffers_and_constants, param_vertex_counter;
   struct ac_arg param_vb_desc, param_const_desc;
   struct ac_arg param_base_vertex, param_start_instance;
   struct ac_arg param_block_id, param_local_id, param_ordered_wave_id;
   struct ac_arg param_restart_index, param_smallprim_precision;
   struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms;
   struct ac_arg param_sampler_desc, param_last_wave_prim_id, param_vertex_count_addr;

   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
              &param_index_buffers_and_constants);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_vertex_counter);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_last_wave_prim_id);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_vertex_count_addr);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
              &param_vb_desc);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type,
              &param_const_desc);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
              &param_sampler_desc);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_base_vertex);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_start_instance);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_multiplier);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_terms);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_restart_index);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, &param_smallprim_precision);

   /* Block ID and thread ID inputs. */
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_block_id);
   if (VERTEX_COUNTER_GDS_MODE == 2)
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_ordered_wave_id);
   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &param_local_id);
   /* Create the compute shader function. */
   unsigned old_type = ctx->type;
   ctx->type = PIPE_SHADER_COMPUTE;
   si_create_function(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE);
   ctx->type = old_type;

   if (VERTEX_COUNTER_GDS_MODE == 1) {
      ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size",
                                           GDS_SIZE_UNORDERED);
   }
   /* Assemble parameters for VS. */
   LLVMValueRef vs_params[16];
   unsigned num_vs_params = 0;
   unsigned param_vertex_id, param_instance_id;

   vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */
   vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */
   vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc);
   vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc);
   vs_params[num_vs_params++] = LLVMConstInt(ctx->i32,
                                             S_VS_STATE_INDEXED(key->opt.cs_indexed), 0);
   vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex);
   vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance);
   vs_params[num_vs_params++] = ctx->i32_0; /* DrawID */
   vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc);

   vs_params[(param_vertex_id = num_vs_params++)] = NULL;   /* VertexID */
   vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */
   vs_params[num_vs_params++] = ctx->i32_0; /* unused (PrimID) */
   vs_params[num_vs_params++] = ctx->i32_0; /* unused */

   assert(num_vs_params <= ARRAY_SIZE(vs_params));
   assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs))));
   /* Load descriptors. (load 8 dwords at once) */
   LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8];

   LLVMValueRef index_buffers_and_constants =
      ac_get_arg(&ctx->ac, param_index_buffers_and_constants);
   tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants,
                              ac_array_in_const32_addr_space(ctx->v8i32), "");
   tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->i32_0);

   for (unsigned i = 0; i < 8; i++)
      desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i);

   input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4);
   output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4);
   /* Compute PrimID and InstanceID. */
   LLVMValueRef global_thread_id =
      ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id),
                    LLVMConstInt(ctx->i32, THREADGROUP_SIZE, 0),
                    ac_get_arg(&ctx->ac, param_local_id));
   LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */
   LLVMValueRef instance_id = ctx->i32_0;

   if (key->opt.cs_instancing) {
      LLVMValueRef num_prims_udiv_terms =
         ac_get_arg(&ctx->ac, param_num_prims_udiv_terms);
      LLVMValueRef num_prims_udiv_multiplier =
         ac_get_arg(&ctx->ac, param_num_prims_udiv_multiplier);
      /* Unpack num_prims_udiv_terms. */
      LLVMValueRef post_shift = LLVMBuildAnd(builder, num_prims_udiv_terms,
                                             LLVMConstInt(ctx->i32, 0x1f, 0), "");
      LLVMValueRef prims_per_instance = LLVMBuildLShr(builder, num_prims_udiv_terms,
                                                      LLVMConstInt(ctx->i32, 5, 0), "");
      /* Divide the total prim_id by the number of prims per instance. */
      instance_id = ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id,
                                                     num_prims_udiv_multiplier,
                                                     post_shift);
      /* Compute the remainder. */
      prim_id = LLVMBuildSub(builder, prim_id,
                             LLVMBuildMul(builder, instance_id,
                                          prims_per_instance, ""), "");
   }
   /* Generate indices (like a non-indexed draw call). */
   LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->i32)};
   unsigned vertices_per_prim = 3;

   switch (key->opt.cs_prim_type) {
   case PIPE_PRIM_TRIANGLES:
      for (unsigned i = 0; i < 3; i++) {
         index[i] = ac_build_imad(&ctx->ac, prim_id,
                                  LLVMConstInt(ctx->i32, 3, 0),
                                  LLVMConstInt(ctx->i32, i, 0));
      }
      break;
   case PIPE_PRIM_TRIANGLE_STRIP:
      for (unsigned i = 0; i < 3; i++) {
         index[i] = LLVMBuildAdd(builder, prim_id,
                                 LLVMConstInt(ctx->i32, i, 0), "");
      }
      break;
   case PIPE_PRIM_TRIANGLE_FAN:
      /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper
       * and rasterizer as a normal triangle, so we need to put the provoking
       * vertex into the correct index variable and preserve orientation at the same time.
       * gl_VertexID is preserved, because it's equal to the index.
       */
      if (key->opt.cs_provoking_vertex_first) {
         index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), "");
         index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), "");
         index[2] = ctx->i32_0;
      } else {
         index[0] = ctx->i32_0;
         index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), "");
         index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), "");
      }
      break;
   default:
      unreachable("unexpected primitive type");
   }

   /* Fetch the indices from the index buffer for indexed draws. */
   if (key->opt.cs_indexed) {
      for (unsigned i = 0; i < 3; i++) {
         index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf,
                                                index[i], ctx->i32_0, 1,
                                                0, true);
         index[i] = ac_to_integer(&ctx->ac, index[i]);
      }
   }
   LLVMValueRef ordered_wave_id = ac_get_arg(&ctx->ac, param_ordered_wave_id);

   /* Extract the ordered wave ID. */
   if (VERTEX_COUNTER_GDS_MODE == 2) {
      ordered_wave_id = LLVMBuildLShr(builder, ordered_wave_id,
                                      LLVMConstInt(ctx->i32, 6, 0), "");
      ordered_wave_id = LLVMBuildAnd(builder, ordered_wave_id,
                                     LLVMConstInt(ctx->i32, 0xfff, 0), "");
   }

   LLVMValueRef thread_id =
      LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id),
                   LLVMConstInt(ctx->i32, 63, 0), "");

   /* Every other triangle in a strip has a reversed vertex order, so we
    * need to swap vertices of odd primitives to get the correct primitive
    * orientation when converting triangle strips to triangles. Primitive
    * restart complicates it, because a strip can start anywhere.
    */
   LLVMValueRef prim_restart_accepted = ctx->i1true;
   LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter);

   if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) {
      /* Without primitive restart, odd primitives have reversed orientation.
       * Only primitive restart can flip it with respect to the first vertex
       * of the draw call.
       */
      LLVMValueRef first_is_odd = ctx->i1false;
516 if (key
->opt
.cs_primitive_restart
) {
517 /* Get the GDS primitive restart continue flag and clear
518 * the flag in vertex_counter. This flag is used when the draw
519 * call was split and we need to load the primitive orientation
520 * flag from GDS for the first wave too.
522 LLVMValueRef gds_prim_restart_continue
=
523 LLVMBuildLShr(builder
, vertex_counter
,
524 LLVMConstInt(ctx
->i32
, 31, 0), "");
525 gds_prim_restart_continue
=
526 LLVMBuildTrunc(builder
, gds_prim_restart_continue
, ctx
->i1
, "");
527 vertex_counter
= LLVMBuildAnd(builder
, vertex_counter
,
528 LLVMConstInt(ctx
->i32
, 0x7fffffff, 0), "");
530 LLVMValueRef index0_is_reset
;
532 for (unsigned i
= 0; i
< 3; i
++) {
533 LLVMValueRef not_reset
= LLVMBuildICmp(builder
, LLVMIntNE
, index
[i
],
534 ac_get_arg(&ctx
->ac
, param_restart_index
),
537 index0_is_reset
= LLVMBuildNot(builder
, not_reset
, "");
538 prim_restart_accepted
= LLVMBuildAnd(builder
, prim_restart_accepted
,
         /* If the previous waves flip the primitive orientation
          * of the current triangle strip, it will be stored in GDS.
          *
          * Sometimes the correct orientation is not needed, in which case
          * we don't need to execute this.
          */
         if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) {
            /* If there are reset indices in this wave, get the thread index
             * where the most recent strip starts relative to each thread.
             */
            LLVMValueRef preceding_threads_mask =
               LLVMBuildSub(builder,
                            LLVMBuildShl(builder, ctx->ac.i64_1,
                                         LLVMBuildZExt(builder, thread_id, ctx->i64, ""), ""),
                            ctx->ac.i64_1, "");

            LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset);
            LLVMValueRef preceding_reset_threadmask =
               LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, "");
            LLVMValueRef strip_start =
               ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL);
            strip_start = LLVMBuildAdd(builder, strip_start, ctx->i32_1, "");

            /* This flips the orientation based on reset indices within this wave only. */
            first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->i1, "");

            LLVMValueRef last_strip_start, prev_wave_state, ret, tmp;
            LLVMValueRef is_first_wave, current_wave_resets_index;
            /* Get the thread index where the last strip starts in this wave.
             *
             * If the last strip doesn't start in this wave, the thread index
             * will be 0.
             *
             * If the last strip starts in the next wave, the thread index will
             * be 64.
             */
            last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL);
            last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->i32_1, "");

            struct si_thread0_section section;
            si_enter_thread0_section(ctx, &section, thread_id);

            /* This must be done in the thread 0 section, because
             * we expect PrimID to be 0 for the whole first wave
             * in this expression.
             *
             * NOTE: This will need to be different if we wanna support
             * instancing with primitive restart.
             */
            is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->i32_0, "");
            is_first_wave = LLVMBuildAnd(builder, is_first_wave,
                                         LLVMBuildNot(builder,
                                                      gds_prim_restart_continue, ""), "");
            current_wave_resets_index = LLVMBuildICmp(builder, LLVMIntNE,
                                                      last_strip_start, ctx->i32_0, "");

            ret = ac_build_alloca_undef(&ctx->ac, ctx->i32, "prev_state");

            /* Save the last strip start primitive index in GDS and read
             * the value that previous waves stored.
             *
             * if (is_first_wave || current_wave_resets_strip)
             *    // Read the value that previous waves stored and store a new one.
             *    first_is_odd = ds.ordered.swap(last_strip_start);
             * else
             *    // Just read the value that previous waves stored.
             *    first_is_odd = ds.ordered.add(0);
             */
            ac_build_ifcc(&ctx->ac,
                          LLVMBuildOr(builder, is_first_wave,
                                      current_wave_resets_index, ""), 12602);

            /* The GDS address is always 0 with ordered append. */
            tmp = si_build_ds_ordered_op(ctx, "swap",
                                         ordered_wave_id, last_strip_start,
                                         1, true, true);
            LLVMBuildStore(builder, tmp, ret);

            ac_build_else(&ctx->ac, 12603);

            /* Just read the value from GDS. */
            tmp = si_build_ds_ordered_op(ctx, "add",
                                         ordered_wave_id, ctx->i32_0,
                                         1, true, true);
            LLVMBuildStore(builder, tmp, ret);

            ac_build_endif(&ctx->ac, 12602);

            prev_wave_state = LLVMBuildLoad(builder, ret, "");
            /* Ignore the return value if this is the first wave. */
            prev_wave_state = LLVMBuildSelect(builder, is_first_wave,
                                              ctx->i32_0, prev_wave_state, "");
            si_exit_thread0_section(&section, &prev_wave_state);
            prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->i1, "");

            /* If the strip start appears to be on thread 0 for the current primitive
             * (meaning the reset index is not present in this wave and might have
             * appeared in previous waves), use the value from GDS to determine
             * primitive orientation.
             *
             * If the strip start is in this wave for the current primitive, use
             * the value from the current wave to determine primitive orientation.
             */
            LLVMValueRef strip_start_is0 = LLVMBuildICmp(builder, LLVMIntEQ,
                                                         strip_start, ctx->i32_0, "");
            first_is_odd = LLVMBuildSelect(builder, strip_start_is0, prev_wave_state,
                                           first_is_odd, "");
         }
      }
      /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */
      LLVMValueRef prim_is_odd =
         LLVMBuildXor(builder, first_is_odd,
                      LLVMBuildTrunc(builder, thread_id, ctx->i1, ""), "");

      /* Determine the primitive orientation.
       * Only swap the vertices that are not the provoking vertex. We need to keep
       * the provoking vertex in place.
       */
      if (key->opt.cs_provoking_vertex_first) {
         LLVMValueRef index1 = index[1];
         LLVMValueRef index2 = index[2];
         index[1] = LLVMBuildSelect(builder, prim_is_odd, index2, index1, "");
         index[2] = LLVMBuildSelect(builder, prim_is_odd, index1, index2, "");
      } else {
         LLVMValueRef index0 = index[0];
         LLVMValueRef index1 = index[1];
         index[0] = LLVMBuildSelect(builder, prim_is_odd, index1, index0, "");
         index[1] = LLVMBuildSelect(builder, prim_is_odd, index0, index1, "");
      }
   }
   /* Execute the vertex shader for each vertex to get vertex positions. */
   LLVMValueRef pos[3][4];
   for (unsigned i = 0; i < vertices_per_prim; i++) {
      vs_params[param_vertex_id] = index[i];
      vs_params[param_instance_id] = instance_id;

      LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params);
      for (unsigned chan = 0; chan < 4; chan++)
         pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, "");
   }

   /* Divide XYZ by W. */
   for (unsigned i = 0; i < vertices_per_prim; i++) {
      for (unsigned chan = 0; chan < 3; chan++)
         pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]);
   }
   /* Load the viewport state. */
   LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants,
                                             LLVMConstInt(ctx->i32, 2, 0));
   vp = LLVMBuildBitCast(builder, vp, ctx->v4f32, "");
   LLVMValueRef vp_scale[2], vp_translate[2];
   vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
   vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
   vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
   vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);
   /* Do the culling. */
   struct ac_cull_options options = {};
   options.cull_front = key->opt.cs_cull_front;
   options.cull_back = key->opt.cs_cull_back;
   options.cull_view_xy = true;
   options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z;
   options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z;
   options.cull_small_prims = true;
   options.cull_zero_area = true;
   options.cull_w = true;
   options.use_halfz_clip_space = key->opt.cs_halfz_clip_space;

   LLVMValueRef accepted =
      ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted,
                       vp_scale, vp_translate,
                       ac_get_arg(&ctx->ac, param_smallprim_precision),
                       &options);

   LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted);

   /* Count the number of active threads by doing bitcount(accepted). */
   LLVMValueRef num_prims_accepted =
      ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i64", ctx->i64,
                         &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE);
   num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->i32, "");
   LLVMValueRef start;

   /* Execute atomic_add on the vertex count. */
   struct si_thread0_section section;
   si_enter_thread0_section(ctx, &section, thread_id);

   if (VERTEX_COUNTER_GDS_MODE == 0) {
      LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
                                              LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
      vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter);
      start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
                                 vertex_counter, num_indices,
                                 LLVMAtomicOrderingMonotonic, false);
   } else if (VERTEX_COUNTER_GDS_MODE == 1) {
      LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
                                              LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
      vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter,
                                         LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), "");
      start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
                                 vertex_counter, num_indices,
                                 LLVMAtomicOrderingMonotonic, false);
   } else if (VERTEX_COUNTER_GDS_MODE == 2) {
      LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->i32, "");

      /* If the draw call was split into multiple subdraws, each using
       * a separate draw packet, we need to start counting from 0 for
       * the first compute wave of the subdraw.
       *
       * vertex_counter contains the primitive ID of the first thread
       * in the first wave.
       *
       * This is only correct with VERTEX_COUNTER_GDS_MODE == 2:
       */
      LLVMValueRef is_first_wave =
         LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
                       vertex_counter, "");

      /* Store the primitive count for ordered append, not vertex count.
       * The idea is to avoid GDS initialization via CP DMA. The shader
       * effectively stores the first count using "swap".
       *
       * if (first_wave) {
       *    ds.ordered.swap(num_prims_accepted); // store the first primitive count
       *    previous = 0;
       * } else {
       *    previous = ds.ordered.add(num_prims_accepted); // add the primitive count
       * }
       */
      ac_build_ifcc(&ctx->ac, is_first_wave, 12604);

      /* The GDS address is always 0 with ordered append. */
      si_build_ds_ordered_op(ctx, "swap", ordered_wave_id,
                             num_prims_accepted, 0, true, true);
      LLVMBuildStore(builder, ctx->i32_0, tmp_store);

      ac_build_else(&ctx->ac, 12605);

      LLVMBuildStore(builder,
                     si_build_ds_ordered_op(ctx, "add", ordered_wave_id,
                                            num_prims_accepted, 0,
                                            true, true),
                     tmp_store);

      ac_build_endif(&ctx->ac, 12604);

      start = LLVMBuildLoad(builder, tmp_store, "");
   }

   si_exit_thread0_section(&section, &start);
   /* Write the final vertex count to memory. An EOS/EOP event could do this,
    * but those events are super slow and should be avoided if performance
    * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE
    * event from the shader.
    */
   if (VERTEX_COUNTER_GDS_MODE == 2) {
      ac_build_ifcc(&ctx->ac,
                    LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
                                  ac_get_arg(&ctx->ac, param_last_wave_prim_id), ""),
                    12606);
      LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, "");
      count = LLVMBuildMul(builder, count,
                           LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");

      /* GFX8 needs to disable caching, so that the CP can see the stored value.
       * MTYPE=3 bypasses TC L2.
       */
      if (ctx->screen->info.chip_class <= GFX8) {
         LLVMValueRef desc[] = {
            ac_get_arg(&ctx->ac, param_vertex_count_addr),
            LLVMConstInt(ctx->i32,
                         S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0),
            LLVMConstInt(ctx->i32, 4, 0),
            LLVMConstInt(ctx->i32, S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
                                   S_008F0C_MTYPE(3 /* uncached */), 0),
         };
         LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4);
         ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->i32_0,
                                     ctx->i32_0, 0, ac_glc | ac_slc, false);
      } else {
         LLVMBuildStore(builder, count,
                        si_expand_32bit_pointer(ctx,
                                                ac_get_arg(&ctx->ac,
                                                           param_vertex_count_addr)));
      }
      ac_build_endif(&ctx->ac, 12606);
   } else {
      /* For unordered modes that increment a vertex count instead of
       * primitive count, convert it into the primitive index.
       */
      start = LLVMBuildUDiv(builder, start,
                            LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
   }
   /* Now we need to store the indices of accepted primitives into
    * the output index buffer.
    */
   ac_build_ifcc(&ctx->ac, accepted, 16607);

   /* Get the number of bits set before the index of this thread. */
   LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask);

   /* We have lowered instancing. Pack the instance ID into vertex ID. */
   if (key->opt.cs_instancing) {
      instance_id = LLVMBuildShl(builder, instance_id,
                                 LLVMConstInt(ctx->i32, 16, 0), "");

      for (unsigned i = 0; i < vertices_per_prim; i++)
         index[i] = LLVMBuildOr(builder, index[i], instance_id, "");
   }

   if (VERTEX_COUNTER_GDS_MODE == 2) {
      /* vertex_counter contains the first primitive ID
       * for this dispatch. If the draw call was split into
       * multiple subdraws, the first primitive ID is > 0
       * for subsequent subdraws. Each subdraw uses a different
       * portion of the output index buffer. Offset the store
       * vindex by the first primitive ID to get the correct
       * store address for the subdraw.
       */
      start = LLVMBuildAdd(builder, start, vertex_counter, "");
   }

   /* Write indices for accepted primitives. */
   LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, "");
   LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3);

   if (!ac_has_vec3_support(ctx->ac.chip_class, true))
      vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3);

   ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata,
                                vindex, ctx->i32_0, 3,
                                ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0));

   ac_build_endif(&ctx->ac, 16607);

   LLVMBuildRetVoid(builder);
}
/* Return false if the shader isn't ready. */
static bool si_shader_select_prim_discard_cs(struct si_context *sctx,
                                             const struct pipe_draw_info *info,
                                             bool primitive_restart)
{
   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
   struct si_shader_key key;

   /* Primitive restart needs ordered counters. */
   assert(!primitive_restart || VERTEX_COUNTER_GDS_MODE == 2);
   assert(!primitive_restart || info->instance_count == 1);

   memset(&key, 0, sizeof(key));
   si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, &key, &key.part.vs.prolog);
   assert(!key.part.vs.prolog.instance_divisor_is_fetched);

   key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0;
   key.opt.vs_as_prim_discard_cs = 1;
   key.opt.cs_prim_type = info->mode;
   key.opt.cs_indexed = info->index_size != 0;
   key.opt.cs_instancing = info->instance_count > 1;
   key.opt.cs_primitive_restart = primitive_restart;
   key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first;

   /* Primitive restart with triangle strips needs to preserve primitive
    * orientation for cases where front and back primitive orientation matters.
    */
   if (primitive_restart) {
      struct si_shader_selector *ps = sctx->ps_shader.cso;

      key.opt.cs_need_correct_orientation =
         rs->cull_front != rs->cull_back ||
         ps->info.uses_frontface ||
         (rs->two_side && ps->info.colors_read);
   }

   if (rs->rasterizer_discard) {
      /* Just for performance testing and analysis of trivial bottlenecks.
       * This should result in a very short compute shader. */
      key.opt.cs_cull_front = 1;
      key.opt.cs_cull_back = 1;
   } else {
      key.opt.cs_cull_front =
         sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front;
      key.opt.cs_cull_back =
         sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back;
   }

   if (!rs->depth_clamp_any && CULL_Z) {
      key.opt.cs_cull_z = 1;
      key.opt.cs_halfz_clip_space = rs->clip_halfz;
   }

   sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso;
   sctx->cs_prim_discard_state.current = NULL;

   struct si_compiler_ctx_state compiler_state;
   compiler_state.compiler = &sctx->compiler;
   compiler_state.debug = sctx->debug;
   compiler_state.is_debug_context = sctx->is_debug;

   return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state,
                                    &compiler_state, &key, -1, true) == 0 &&
          /* Disallow compute shaders using the scratch buffer. */
          sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0;
}
static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx)
{
   if (sctx->index_ring)
      return true;

   if (!sctx->prim_discard_compute_cs) {
      struct radeon_winsys *ws = sctx->ws;
      unsigned gds_size = VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED :
                          VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0;
      unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 2 : 0;

      if (gds_size) {
         sctx->gds = ws->buffer_create(ws, gds_size, 4,
                                       RADEON_DOMAIN_GDS, 0);
         if (!sctx->gds)
            return false;

         ws->cs_add_buffer(sctx->gfx_cs, sctx->gds,
                           RADEON_USAGE_READWRITE, 0, 0);
      }
      if (num_oa_counters) {
         assert(gds_size);
         sctx->gds_oa = ws->buffer_create(ws, num_oa_counters,
                                          1, RADEON_DOMAIN_OA, 0);
         if (!sctx->gds_oa)
            return false;

         ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa,
                           RADEON_USAGE_READWRITE, 0, 0);
      }

      sctx->prim_discard_compute_cs =
         ws->cs_add_parallel_compute_ib(sctx->gfx_cs,
                                        num_oa_counters > 0);
      if (!sctx->prim_discard_compute_cs)
         return false;
   }

   if (!sctx->index_ring) {
      sctx->index_ring =
         si_aligned_buffer_create(sctx->b.screen,
                                  SI_RESOURCE_FLAG_UNMAPPABLE,
                                  PIPE_USAGE_DEFAULT,
                                  sctx->index_ring_size_per_ib * 2,
                                  sctx->screen->info.pte_fragment_size);
      if (!sctx->index_ring)
         return false;
   }
   return true;
}
static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_size)
{
   return sctx->index_ring_offset +
          align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <=
          sctx->index_ring_size_per_ib;
}
enum si_prim_discard_outcome
si_prepare_prim_discard_or_split_draw(struct si_context *sctx,
                                      const struct pipe_draw_info *info,
                                      bool primitive_restart)
{
   /* If the compute shader compilation isn't finished, this returns false. */
   if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart))
      return SI_PRIM_DISCARD_DISABLED;

   if (!si_initialize_prim_discard_cmdbuf(sctx))
      return SI_PRIM_DISCARD_DISABLED;

   struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
   unsigned prim = info->mode;
   unsigned count = info->count;
   unsigned instance_count = info->instance_count;
   unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count);
   unsigned num_prims = num_prims_per_instance * instance_count;
   unsigned out_indexbuf_size = num_prims * 12;
   bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size);
   const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL;

   /* Split draws at the draw call level if the ring is full. This makes
    * better use of the ring space.
    */
   if (ring_full &&
       num_prims > split_prims_draw_level &&
       instance_count == 1 && /* TODO: support splitting instanced draws */
       (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) |
                      (1 << PIPE_PRIM_TRIANGLE_STRIP))) {
      /* Split draws. */
      struct pipe_draw_info split_draw = *info;
      split_draw.primitive_restart = primitive_restart;

      unsigned base_start = split_draw.start;

      if (prim == PIPE_PRIM_TRIANGLES) {
         unsigned vert_count_per_subdraw = split_prims_draw_level * 3;
         assert(vert_count_per_subdraw < count);

         for (unsigned start = 0; start < count; start += vert_count_per_subdraw) {
            split_draw.start = base_start + start;
            split_draw.count = MIN2(count - start, vert_count_per_subdraw);

            sctx->b.draw_vbo(&sctx->b, &split_draw);
         }
      } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) {
         /* No primitive pair can be split, because strips reverse orientation
          * for odd primitives. */
         STATIC_ASSERT(split_prims_draw_level % 2 == 0);

         unsigned vert_count_per_subdraw = split_prims_draw_level;

         for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) {
            split_draw.start = base_start + start;
            split_draw.count = MIN2(count - start, vert_count_per_subdraw + 2);

            sctx->b.draw_vbo(&sctx->b, &split_draw);

            if (start == 0 &&
                primitive_restart &&
                sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation)
               sctx->preserve_prim_restart_gds_at_flush = true;
         }
         sctx->preserve_prim_restart_gds_at_flush = false;
      }

      return SI_PRIM_DISCARD_DRAW_SPLIT;
   }
   /* Just quit if the draw call doesn't fit into the ring and can't be split. */
   if (out_indexbuf_size > sctx->index_ring_size_per_ib) {
      if (SI_PRIM_DISCARD_DEBUG)
         puts("PD failed: draw call too big, can't be split");
      return SI_PRIM_DISCARD_DISABLED;
   }

   unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL);
   unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ +
                              24 * (num_subdraws - 1) + /* subdraws */
                              20; /* leave some space at the end */
   unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx);

   if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION)
      need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */
   else
      need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */

   if (ring_full ||
       (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) ||
       !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) {
      /* If the current IB is empty but the size is too small, add a NOP
       * packet to force a flush and get a bigger IB.
       */
      if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) &&
          gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) {
         radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
         radeon_emit(gfx_cs, 0);
      }

      si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
   }

   /* The compute IB is always chained, but we need to call cs_check_space to add more space. */
   struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
   ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false);
   assert(compute_has_space);
   assert(si_check_ring_space(sctx, out_indexbuf_size));
   return SI_PRIM_DISCARD_ENABLED;
}
void si_compute_signal_gfx(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
   unsigned writeback_L2_flags = 0;

   /* The writeback L2 flags vary with each chip generation. */
   /* CI needs to flush vertex indices to memory. */
   if (sctx->chip_class <= GFX7)
      writeback_L2_flags = EVENT_TC_WB_ACTION_ENA;
   else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0)
      writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA;

   if (!sctx->compute_num_prims_in_batch)
      return;

   assert(sctx->compute_rewind_va);

   /* After the queued dispatches are done and vertex counts are written to
    * the gfx IB, signal the gfx IB to continue. CP doesn't wait for
    * the dispatches to finish, it only adds the CS_DONE event into the event
    * queue.
    */
   si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags,
                     sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
                     writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM :
                                          EOP_INT_SEL_NONE,
                     EOP_DATA_SEL_VALUE_32BIT,
                     NULL,
                     sctx->compute_rewind_va |
                     ((uint64_t)sctx->screen->info.address32_hi << 32),
                     REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */
                     SI_NOT_QUERY);

   sctx->compute_rewind_va = 0;
   sctx->compute_num_prims_in_batch = 0;
}
/* Dispatch a primitive discard compute shader. */
void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
                                          const struct pipe_draw_info *info,
                                          unsigned index_size,
                                          unsigned base_vertex,
                                          uint64_t input_indexbuf_va,
                                          unsigned input_indexbuf_num_elements)
{
   struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
   struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
   unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, info->count);
   if (!num_prims_per_instance)
      return;

   unsigned num_prims = num_prims_per_instance * info->instance_count;
   unsigned vertices_per_prim, output_indexbuf_format;

   switch (info->mode) {
   case PIPE_PRIM_TRIANGLES:
   case PIPE_PRIM_TRIANGLE_STRIP:
   case PIPE_PRIM_TRIANGLE_FAN:
      vertices_per_prim = 3;
      output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32;
      break;
   default:
      unreachable("unsupported primitive type");
      return;
   }

   unsigned out_indexbuf_offset;
   uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4;
   bool first_dispatch = !sctx->prim_discard_compute_ib_initialized;
   /* Initialize the compute IB if it's empty. */
   if (!sctx->prim_discard_compute_ib_initialized) {
      /* 1) State initialization. */
      sctx->compute_gds_offset = 0;
      sctx->compute_ib_last_shader = NULL;

      if (sctx->last_ib_barrier_fence) {
         assert(!sctx->last_ib_barrier_buf);
         sctx->ws->cs_add_fence_dependency(gfx_cs,
                                           sctx->last_ib_barrier_fence,
                                           RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY);
      }

      /* 2) IB initialization. */

      /* This needs to be done at the beginning of IBs due to possible
       * TTM buffer moves in the kernel.
       *
       * TODO: update for GFX10
       */
      si_emit_surface_sync(sctx, cs,
                           S_0085F0_TC_ACTION_ENA(1) |
                           S_0085F0_TCL1_ACTION_ENA(1) |
                           S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) |
                           S_0085F0_SH_ICACHE_ACTION_ENA(1) |
                           S_0085F0_SH_KCACHE_ACTION_ENA(1));

      /* Restore the GDS prim restart counter if needed. */
      if (sctx->preserve_prim_restart_gds_at_flush) {
         si_cp_copy_data(sctx, cs,
                         COPY_DATA_GDS, NULL, 4,
                         COPY_DATA_SRC_MEM, sctx->wait_mem_scratch, 4);
      }

      si_emit_initial_compute_regs(sctx, cs);

      radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE,
                        S_00B860_WAVES(sctx->scratch_waves) |
                        S_00B860_WAVESIZE(0)); /* no scratch */

      /* Only 1D grids are launched. */
      radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2);
      radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) |
                      S_00B820_NUM_THREAD_PARTIAL(1));
      radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) |
                      S_00B824_NUM_THREAD_PARTIAL(1));

      radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2);
      radeon_emit(cs, 0);
      radeon_emit(cs, 0);

      /* Disable ordered alloc for OA resources. */
      for (unsigned i = 0; i < 2; i++) {
         radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3);
         radeon_emit(cs, S_031074_INDEX(i));
         radeon_emit(cs, 0);
         radeon_emit(cs, S_03107C_ENABLE(0));
      }

      if (sctx->last_ib_barrier_buf) {
         assert(!sctx->last_ib_barrier_fence);
         radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf,
                                   RADEON_USAGE_READ, RADEON_PRIO_FENCE);
         si_cp_wait_mem(sctx, cs,
                        sctx->last_ib_barrier_buf->gpu_address +
                        sctx->last_ib_barrier_buf_offset, 1, 1,
                        WAIT_REG_MEM_EQUAL);
      }

      sctx->prim_discard_compute_ib_initialized = true;
   }
   /* Allocate the output index buffer. */
   output_indexbuf_size = align(output_indexbuf_size,
                                sctx->screen->info.tcc_cache_line_size);
   assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib);
   out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset;
   sctx->index_ring_offset += output_indexbuf_size;

   radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE,
                             RADEON_PRIO_SHADER_RW_BUFFER);
   uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset;
   /* Prepare index buffer descriptors. */
   struct si_resource *indexbuf_desc = NULL;
   unsigned indexbuf_desc_offset;
   unsigned desc_size = 12 * 4;
   uint32_t *desc;

   u_upload_alloc(sctx->b.const_uploader, 0, desc_size,
                  si_optimal_tcc_alignment(sctx, desc_size),
                  &indexbuf_desc_offset, (struct pipe_resource **)&indexbuf_desc,
                  (void **)&desc);
   radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ,
                             RADEON_PRIO_DESCRIPTORS);

   /* Input index buffer. */
   desc[0] = input_indexbuf_va;
   desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) |
             S_008F04_STRIDE(index_size);
   desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? index_size : 1);
   desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
             S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
             S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 :
                                  index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16 :
                                                    V_008F0C_BUF_DATA_FORMAT_32);

   /* Output index buffer. */
   desc[4] = out_indexbuf_va;
   desc[5] = S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) |
             S_008F04_STRIDE(vertices_per_prim * 4);
   desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1);
   desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
             S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
             S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
             S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
             S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
             S_008F0C_DATA_FORMAT(output_indexbuf_format);
   /* Viewport state.
    * This is needed by the small primitive culling, because it's done
    * in screen space.
    */
   float scale[2], translate[2];

   scale[0] = sctx->viewports.states[0].scale[0];
   scale[1] = sctx->viewports.states[0].scale[1];
   translate[0] = sctx->viewports.states[0].translate[0];
   translate[1] = sctx->viewports.states[0].translate[1];

   /* The viewport shouldn't flip the X axis for the small prim culling to work. */
   assert(-scale[0] + translate[0] <= scale[0] + translate[0]);

   /* If the Y axis is inverted (OpenGL default framebuffer), reverse it.
    * This is because the viewport transformation inverts the clip space
    * bounding box, so min becomes max, which breaks small primitive
    * culling.
    */
   if (sctx->viewports.y_inverted) {
      scale[1] = -scale[1];
      translate[1] = -translate[1];
   }

   /* Scale the framebuffer up, so that samples become pixels and small
    * primitive culling is the same for all sample counts.
    * This only works with the standard DX sample positions, because
    * the samples are evenly spaced on both X and Y axes.
    */
   unsigned num_samples = sctx->framebuffer.nr_samples;
   assert(num_samples >= 1);

   for (unsigned i = 0; i < 2; i++) {
      scale[i] *= num_samples;
      translate[i] *= num_samples;
   }

   desc[8] = fui(scale[0]);
   desc[9] = fui(scale[1]);
   desc[10] = fui(translate[0]);
   desc[11] = fui(translate[1]);

   /* Better subpixel precision increases the efficiency of small
    * primitive culling. */
   unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode;
   float small_prim_cull_precision;

   if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
      small_prim_cull_precision = num_samples / 4096.0;
   else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
      small_prim_cull_precision = num_samples / 1024.0;
   else
      small_prim_cull_precision = num_samples / 256.0;
   /* Set user data SGPRs. */
   /* This can't be greater than 14 if we want the fastest launch rate. */
   unsigned user_sgprs = 13;

   uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset;
   unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX);
   unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX);
   uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address;
   uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address;
   uint64_t vb_desc_va = sctx->vb_descriptors_buffer ?
                            sctx->vb_descriptors_buffer->gpu_address +
                            sctx->vb_descriptors_offset : 0;
   unsigned gds_offset, gds_size;
   struct si_fast_udiv_info32 num_prims_udiv = {};

   if (info->instance_count > 1)
      num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31);

   /* Limitations on how these two are packed in the user SGPR. */
   assert(num_prims_udiv.post_shift < 32);
   assert(num_prims_per_instance < 1 << 27);

   si_resource_reference(&indexbuf_desc, NULL);

   bool primitive_restart = sctx->cs_prim_discard_state.current->key.opt.cs_primitive_restart;
   if (VERTEX_COUNTER_GDS_MODE == 1) {
      gds_offset = sctx->compute_gds_offset;
      gds_size = primitive_restart ? 8 : 4;
      sctx->compute_gds_offset += gds_size;

      /* Reset the counters in GDS for the first dispatch using WRITE_DATA.
       * The remainder of the GDS will be cleared after the dispatch packet
       * in parallel with compute shaders.
       */
      if (first_dispatch) {
         radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size / 4, 0));
         radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1));
         radeon_emit(cs, gds_offset);
         radeon_emit(cs, 0);
         radeon_emit(cs, 0); /* value to write */
         if (gds_size == 8)
            radeon_emit(cs, 0);
      }
   }
   /* Set shader registers. */
   struct si_shader *shader = sctx->cs_prim_discard_state.current;

   if (shader != sctx->compute_ib_last_shader) {
      radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ,
                                RADEON_PRIO_SHADER_BINARY);
      uint64_t shader_va = shader->bo->gpu_address;

      assert(shader->config.scratch_bytes_per_wave == 0);
      assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4);

      radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
      radeon_emit(cs, shader_va >> 8);
      radeon_emit(cs, S_00B834_DATA(shader_va >> 40));

      radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
      radeon_emit(cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) |
                      S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8) |
                      S_00B848_FLOAT_MODE(shader->config.float_mode) |
                      S_00B848_DX10_CLAMP(1));
      radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) |
                      S_00B84C_USER_SGPR(user_sgprs) |
                      S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) |
                      S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) |
                      S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) |
                      S_00B84C_LDS_SIZE(shader->config.lds_size));

      radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
                        ac_get_compute_resource_limits(&sctx->screen->info,
                                                       WAVES_PER_TG,
                                                       MAX_WAVES_PER_SH,
                                                       THREADGROUPS_PER_CU));
      sctx->compute_ib_last_shader = shader;
   }

   STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0);
   /* Big draw calls are split into smaller dispatches and draw packets. */
   for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) {
      unsigned num_subdraw_prims;

      if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims)
         num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL;
      else
         num_subdraw_prims = num_prims - start_prim;

      /* Small dispatches are executed back to back until a specific primitive
       * count is reached. Then, a CS_DONE is inserted to signal the gfx IB
       * to start drawing the batch. This batching adds latency to the gfx IB,
       * but CS_DONE and REWIND are too slow.
       */
      if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH)
         si_compute_signal_gfx(sctx);

      if (sctx->compute_num_prims_in_batch == 0) {
         assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi);
         sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4;

         if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) {
            radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
            radeon_emit(gfx_cs, 0);

            si_cp_wait_mem(sctx, gfx_cs,
                           sctx->compute_rewind_va |
                           (uint64_t)sctx->screen->info.address32_hi << 32,
                           REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT,
                           WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP);

            /* Use INDIRECT_BUFFER to chain to a different buffer
             * to discard the CP prefetch cache.
             */
            sctx->ws->cs_check_space(gfx_cs, 0, true);
         } else {
            radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
            radeon_emit(gfx_cs, 0);
         }
      }

      sctx->compute_num_prims_in_batch += num_subdraw_prims;

      uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4;
      uint64_t index_va = out_indexbuf_va + start_prim * 12;

      /* Emit the draw packet into the gfx IB. */
      radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0));
      radeon_emit(gfx_cs, num_prims * vertices_per_prim);
      radeon_emit(gfx_cs, index_va);
      radeon_emit(gfx_cs, index_va >> 32);
      radeon_emit(gfx_cs, 0);
      radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA);
      /* Continue with the compute IB. */
      if (start_prim == 0) {
         uint32_t gds_prim_restart_continue_bit = 0;

         if (sctx->preserve_prim_restart_gds_at_flush) {
            assert(primitive_restart &&
                   info->mode == PIPE_PRIM_TRIANGLE_STRIP);
            assert(start_prim < 1 << 31);
            gds_prim_restart_continue_bit = 1 << 31;
         }

         radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs);
         radeon_emit(cs, index_buffers_va);
         radeon_emit(cs,
                     VERTEX_COUNTER_GDS_MODE == 0 ? count_va :
                     VERTEX_COUNTER_GDS_MODE == 1 ? gds_offset :
                                                    start_prim |
                                                    gds_prim_restart_continue_bit);
         radeon_emit(cs, start_prim + num_subdraw_prims - 1);
         radeon_emit(cs, count_va);
         radeon_emit(cs, vb_desc_va);
         radeon_emit(cs, vs_const_desc_va);
         radeon_emit(cs, vs_sampler_desc_va);
         radeon_emit(cs, base_vertex);
         radeon_emit(cs, info->start_instance);
         radeon_emit(cs, num_prims_udiv.multiplier);
         radeon_emit(cs, num_prims_udiv.post_shift |
                         (num_prims_per_instance << 5));
         radeon_emit(cs, info->restart_index);
         /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
         radeon_emit(cs, fui(small_prim_cull_precision));
      } else {
         assert(VERTEX_COUNTER_GDS_MODE == 2);
         /* Only update the SGPRs that changed. */
         radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3);
         radeon_emit(cs, start_prim);
         radeon_emit(cs, start_prim + num_subdraw_prims - 1);
         radeon_emit(cs, count_va);
      }
      /* Set grid dimensions. */
      unsigned start_block = start_prim / THREADGROUP_SIZE;
      unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE;
      unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE;

      radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block);
      radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X,
                        S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) |
                        S_00B81C_NUM_THREAD_PARTIAL(partial_block_size));

      radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) |
                      PKT3_SHADER_TYPE_S(1));
      radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size);
      radeon_emit(cs, 1);
      radeon_emit(cs, 1);
      radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) |
                      S_00B800_PARTIAL_TG_EN(!!partial_block_size) |
                      S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) |
                      S_00B800_ORDER_MODE(0 /* launch in order */));

      /* This is only for unordered append. Ordered append writes this from
       * the shader.
       *
       * Note that EOP and EOS events are super slow, so emulating the event
       * in a shader is an important optimization.
       */
      if (VERTEX_COUNTER_GDS_MODE == 1) {
         si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0,
                           sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
                           EOP_INT_SEL_NONE,
                           EOP_DATA_SEL_GDS,
                           NULL,
                           count_va | ((uint64_t)sctx->screen->info.address32_hi << 32),
                           EOP_DATA_GDS(gds_offset / 4, 1),
                           SI_NOT_QUERY);

         /* Now that compute shaders are running, clear the remainder of GDS. */
         if (first_dispatch) {
            unsigned offset = gds_offset + gds_size;
            si_cp_dma_clear_buffer(sctx, cs, NULL, offset,
                                   GDS_SIZE_UNORDERED - offset,
                                   0,
                                   SI_CPDMA_SKIP_CHECK_CS_SPACE |
                                   SI_CPDMA_SKIP_GFX_SYNC |
                                   SI_CPDMA_SKIP_SYNC_BEFORE,
                                   SI_COHERENCY_NONE, L2_BYPASS);
         }
      }
      first_dispatch = false;

      assert(cs->current.cdw <= cs->current.max_dw);
      assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw);
   }
}