1 /*
2 * Copyright 2019 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 *
24 */
25
26 #include "si_pipe.h"
27 #include "si_shader_internal.h"
28 #include "sid.h"
29 #include "si_build_pm4.h"
30 #include "ac_llvm_cull.h"
31
32 #include "util/u_prim.h"
33 #include "util/u_suballoc.h"
34 #include "util/u_upload_mgr.h"
35 #include "util/fast_idiv_by_const.h"
36
37 /* Based on:
38 * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf
39 */
40
41 /* This file implements primitive culling using asynchronous compute.
42 * It's written to be GL conformant.
43 *
44 * It takes a monolithic VS in LLVM IR returning gl_Position and invokes it
45 * in a compute shader. The shader processes 1 primitive/thread by invoking
46 * the VS for each vertex to get the positions, decomposes strips and fans
47 * into triangles (if needed), eliminates primitive restart (if needed),
48 * does (W<0) culling, face culling, view XY culling, zero-area and
49 * small-primitive culling, and generates a new index buffer that doesn't
50 * contain culled primitives.
51 *
52 * The index buffer is generated using the Ordered Count feature of GDS,
53 * which is an atomic counter that is incremented in the wavefront launch
54 * order, so that the original primitive order is preserved.
55 *
56 * Another GDS ordered counter is used to eliminate primitive restart indices.
57 * If a restart index lands on an even thread ID, the compute shader has to flip
58 * the primitive orientation of the whole following triangle strip. The primitive
59 * orientation has to be correct after strip and fan decomposition for two-sided
60 * shading to behave correctly. The decomposition also needs to be aware of
61 * which vertex is the provoking vertex for flat shading to behave correctly.
62 *
63 * IB = a GPU command buffer
64 *
65 * Both the compute and gfx IBs run in parallel sort of like CE and DE.
66 * The gfx IB has a CP barrier (REWIND packet) before a draw packet. REWIND
67 * doesn't continue if its word isn't 0x80000000. Once compute shaders are
68 * finished culling, the last wave will write the final primitive count from
69 * GDS directly into the count word of the draw packet in the gfx IB, and
70 * a CS_DONE event will signal the REWIND packet to continue. It's really
71 * a direct draw with command buffer patching from the compute queue.
72 *
73 * The compute IB doesn't have to start when its corresponding gfx IB starts,
74 * but can start sooner. The compute IB is signaled to start after the last
75 * execution barrier in the *previous* gfx IB. This is handled as follows.
76 * The kernel GPU scheduler starts the compute IB after the previous gfx IB has
77 * started. The compute IB then waits (WAIT_REG_MEM) for a mid-IB fence that
78 * represents the barrier in the previous gfx IB.
79 *
80 * Features:
81 * - Triangle strips and fans are decomposed into an indexed triangle list.
82 * The decomposition differs based on the provoking vertex state.
83 * - Instanced draws are converted into non-instanced draws for 16-bit indices.
84 * (InstanceID is stored in the high bits of VertexID and unpacked by VS)
85 * - Primitive restart is fully supported with triangle strips, including
86 * correct primitive orientation across multiple waves. (restart indices
87 * reset primitive orientation)
88 * - W<0 culling (W<0 is behind the viewer, sort of like near Z culling).
89 * - Back face culling, incl. culling zero-area / degenerate primitives.
90 * - View XY culling.
91 * - View Z culling (disabled due to limited impact with perspective projection).
92 * - Small primitive culling for all MSAA modes and all quant modes.
93 *
94 * The following are not implemented:
95 * - ClipVertex/ClipDistance/CullDistance-based culling.
96 * - Scissor culling.
97 * - HiZ culling.
98 *
99 * Limitations (and unimplemented features that may be possible to implement):
100 * - Only triangles, triangle strips, and triangle fans are supported.
101 * - Primitive restart is only supported with triangle strips.
102 * - Instancing and primitive restart can't be used together.
103 * - Instancing is only supported with 16-bit indices and instance count <= 2^16.
104 * - The instance divisor buffer is unavailable, so all divisors must be
105 * either 0 or 1.
106 * - Multidraws where the vertex shader reads gl_DrawID are unsupported.
107 * - No support for tessellation and geometry shaders.
108 * (patch elimination where tess factors are 0 would be possible to implement)
109 * - The vertex shader must not contain memory stores.
 110  * - No VS resource may have a write usage in the command buffer.
111 * (TODO: all shader buffers currently set the write usage)
112 * - Bindless textures and images must not occur in the vertex shader.
113 *
114 * User data SGPR layout:
115 * INDEX_BUFFERS: pointer to constants
116 * 0..3: input index buffer - typed buffer view
117 * 4..7: output index buffer - typed buffer view
118 * 8..11: viewport state - scale.xy, translate.xy
119 * VERTEX_COUNTER: counter address or first primitive ID
 120  *   - If unordered memory counter: address of "count" in the draw packet,
 121  *                                  which is incremented atomically by the shader.
122 * - If unordered GDS counter: address of "count" in GDS starting from 0,
123 * must be initialized to 0 before the dispatch.
124 * - If ordered GDS counter: the primitive ID that should reset the vertex
125 * counter to 0 in GDS
126 * LAST_WAVE_PRIM_ID: the primitive ID that should write the final vertex
127 * count to memory if using GDS ordered append
128 * VERTEX_COUNT_ADDR: where the last wave should write the vertex count if
129 * using GDS ordered append
130 * VS.VERTEX_BUFFERS: same value as VS
131 * VS.CONST_AND_SHADER_BUFFERS: same value as VS
132 * VS.SAMPLERS_AND_IMAGES: same value as VS
133 * VS.BASE_VERTEX: same value as VS
134 * VS.START_INSTANCE: same value as VS
135 * NUM_PRIMS_UDIV_MULTIPLIER: For fast 31-bit division by the number of primitives
136 * per instance for instancing.
137 * NUM_PRIMS_UDIV_TERMS:
138 * - Bits [0:4]: "post_shift" for fast 31-bit division for instancing.
139 * - Bits [5:31]: The number of primitives per instance for computing the remainder.
140 * PRIMITIVE_RESTART_INDEX
141 * SMALL_PRIM_CULLING_PRECISION: Scale the primitive bounding box by this number.
142 *
143 *
144 * The code contains 3 codepaths:
145 * - Unordered memory counter (for debugging, random primitive order, no primitive restart)
146 * - Unordered GDS counter (for debugging, random primitive order, no primitive restart)
147 * - Ordered GDS counter (it preserves the primitive order)
148 *
149 * How to test primitive restart (the most complicated part because it needs
150 * to get the primitive orientation right):
151 * Set THREADGROUP_SIZE to 2 to exercise both intra-wave and inter-wave
152 * primitive orientation flips with small draw calls, which is what most tests use.
153 * You can also enable draw call splitting into draw calls with just 2 primitives.
154 */
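
/* Illustration (editor's sketch, not driver code): how the NUM_PRIMS_UDIV_*
 * user SGPRs described above fit together. The multiplier/post_shift pair is
 * precomputed on the CPU by si_compute_fast_udiv_info32(); the lines below
 * only show the packing implied by the shader's unpacking code further down,
 * and fast_udiv() stands in for ac_build_fast_udiv_u31_d_not_one().
 *
 *    // CPU side:
 *    udiv_terms = post_shift | (num_prims_per_instance << 5);
 *    // requires post_shift < 32 and num_prims_per_instance < (1 << 27)
 *
 *    // Shader side (see the cs_instancing path in the shader builder):
 *    post_shift          = udiv_terms & 0x1f;
 *    prims_per_instance  = udiv_terms >> 5;
 *    instance_id         = fast_udiv(prim_id, multiplier, post_shift);
 *    prim_id_in_instance = prim_id - instance_id * prims_per_instance;
 */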
155
156 /* At least 256 is needed for the fastest wave launch rate from compute queues
157 * due to hw constraints. Nothing in the code needs more than 1 wave/threadgroup. */
158 #define THREADGROUP_SIZE 256 /* high numbers limit available VGPRs */
159 #define THREADGROUPS_PER_CU 1 /* TGs to launch on 1 CU before going onto the next, max 8 */
160 #define MAX_WAVES_PER_SH 0 /* no limit */
161 #define INDEX_STORES_USE_SLC 1 /* don't cache indices if L2 is full */
162 /* Don't cull Z. We already do (W < 0) culling for primitives behind the viewer. */
163 #define CULL_Z 0
164 /* 0 = unordered memory counter, 1 = unordered GDS counter, 2 = ordered GDS counter */
165 #define VERTEX_COUNTER_GDS_MODE 2
166 #define GDS_SIZE_UNORDERED (4 * 1024) /* only for the unordered GDS counter */
167
168 /* Grouping compute dispatches for small draw calls: How many primitives from multiple
169 * draw calls to process by compute before signaling the gfx IB. This reduces the number
170 * of EOP events + REWIND packets, because they decrease performance. */
171 #define PRIMS_PER_BATCH (512 * 1024)
172 /* Draw call splitting at the packet level. This allows signaling the gfx IB
173 * for big draw calls sooner, but doesn't allow context flushes between packets.
174 * Primitive restart is supported. Only implemented for ordered append. */
175 #define SPLIT_PRIMS_PACKET_LEVEL_VALUE PRIMS_PER_BATCH
176 /* If there is not enough ring buffer space for the current IB, split draw calls into
177 * this number of primitives, so that we can flush the context and get free ring space. */
178 #define SPLIT_PRIMS_DRAW_LEVEL PRIMS_PER_BATCH
179
180 /* Derived values. */
181 #define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64)
182 #define SPLIT_PRIMS_PACKET_LEVEL (VERTEX_COUNTER_GDS_MODE == 2 ? \
183 SPLIT_PRIMS_PACKET_LEVEL_VALUE : \
184 UINT_MAX & ~(THREADGROUP_SIZE - 1))
185
186 #define REWIND_SIGNAL_BIT 0x80000000
187 /* For emulating the rewind packet on CI. */
188 #define FORCE_REWIND_EMULATION 0
189
190 void si_initialize_prim_discard_tunables(struct si_context *sctx)
191 {
192 sctx->prim_discard_vertex_count_threshold = UINT_MAX; /* disable */
193
194 if (sctx->chip_class == GFX6 || /* SI support is not implemented */
195 !sctx->screen->info.has_gds_ordered_append ||
196 sctx->screen->debug_flags & DBG(NO_PD) ||
197 /* If aux_context == NULL, we are initializing aux_context right now. */
198 !sctx->screen->aux_context)
199 return;
200
201 /* TODO: enable this after the GDS kernel memory management is fixed */
202 bool enable_on_pro_graphics_by_default = false;
203
204 if (sctx->screen->debug_flags & DBG(ALWAYS_PD) ||
205 sctx->screen->debug_flags & DBG(PD) ||
206 (enable_on_pro_graphics_by_default &&
207 sctx->screen->info.is_pro_graphics &&
208 (sctx->family == CHIP_BONAIRE ||
209 sctx->family == CHIP_HAWAII ||
210 sctx->family == CHIP_TONGA ||
211 sctx->family == CHIP_FIJI ||
212 sctx->family == CHIP_POLARIS10 ||
213 sctx->family == CHIP_POLARIS11 ||
214 sctx->family == CHIP_VEGA10 ||
215 sctx->family == CHIP_VEGA20))) {
216 sctx->prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */
217
218 if (sctx->screen->debug_flags & DBG(ALWAYS_PD))
219 sctx->prim_discard_vertex_count_threshold = 0; /* always enable */
220
221 const uint32_t MB = 1024 * 1024;
222 const uint64_t GB = 1024 * 1024 * 1024;
223
224 /* The total size is double this per context.
225 * Greater numbers allow bigger gfx IBs.
226 */
227 if (sctx->screen->info.vram_size <= 2 * GB)
228 sctx->index_ring_size_per_ib = 64 * MB;
229 else if (sctx->screen->info.vram_size <= 4 * GB)
230 sctx->index_ring_size_per_ib = 128 * MB;
231 else
232 sctx->index_ring_size_per_ib = 256 * MB;
233 }
234 }
235
236 /* Opcode can be "add" or "swap". */
237 static LLVMValueRef
238 si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode,
239 LLVMValueRef m0, LLVMValueRef value, unsigned ordered_count_index,
240 bool release, bool done)
241 {
242 LLVMValueRef args[] = {
243 LLVMBuildIntToPtr(ctx->ac.builder, m0,
244 LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), ""),
245 value,
246 LLVMConstInt(ctx->i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */
247 ctx->i32_0, /* scope */
248 ctx->i1false, /* volatile */
249 LLVMConstInt(ctx->i32, ordered_count_index, 0),
250 LLVMConstInt(ctx->i1, release, 0),
251 LLVMConstInt(ctx->i1, done, 0),
252 };
253
254 char intrinsic[64];
255 snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode);
256 return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->i32, args, ARRAY_SIZE(args), 0);
257 }
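
/* Usage summary (editor's note): this file uses two ordered GDS counters via
 * this helper; the calls below are quoted from the shader builder further down.
 * "m0" carries the ordered wave ID, and the GDS address is always 0 with
 * ordered append.
 *
 *    // ordered_count_index 1: primitive-restart orientation state
 *    si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, last_strip_start, 1, true, false);
 *    si_build_ds_ordered_op(ctx, "add",  ordered_wave_id, ctx->i32_0,       1, true, false);
 *
 *    // ordered_count_index 0: the accepted primitive count
 *    si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, num_prims_accepted, 0, true, true);
 *    si_build_ds_ordered_op(ctx, "add",  ordered_wave_id, num_prims_accepted, 0, true, true);
 */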
258
259 static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr)
260 {
261 uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32;
262 ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->i64, "");
263 ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->i64, hi, 0), "");
264 return LLVMBuildIntToPtr(ctx->ac.builder, ptr,
265 LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GLOBAL), "");
266 }
267
268 struct si_thread0_section {
269 struct si_shader_context *ctx;
270 LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */
271 LLVMValueRef saved_exec;
272 };
273
274 /* Enter a section that only executes on thread 0. */
275 static void si_enter_thread0_section(struct si_shader_context *ctx,
276 struct si_thread0_section *section,
277 LLVMValueRef thread_id)
278 {
279 section->ctx = ctx;
280 section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->i32, "result0");
281
282 /* This IF has 4 instructions:
283 * v_and_b32_e32 v, 63, v ; get the thread ID
284 * v_cmp_eq_u32_e32 vcc, 0, v ; thread ID == 0
285 * s_and_saveexec_b64 s, vcc
286 * s_cbranch_execz BB0_4
287 *
288 * It could just be s_and_saveexec_b64 s, 1.
289 */
290 ac_build_ifcc(&ctx->ac,
291 LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id,
292 ctx->i32_0, ""), 12601);
293 }
294
295 /* Exit a section that only executes on thread 0 and broadcast the result
296 * to all threads. */
297 static void si_exit_thread0_section(struct si_thread0_section *section,
298 LLVMValueRef *result)
299 {
300 struct si_shader_context *ctx = section->ctx;
301
302 LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result);
303
304 ac_build_endif(&ctx->ac, 12601);
305
306 /* Broadcast the result from thread 0 to all threads. */
307 *result = ac_build_readlane(&ctx->ac,
308 LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL);
309 }
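
/* Usage sketch (editor's note), mirroring how these section helpers are used
 * in the shader builder below:
 *
 *    struct si_thread0_section section;
 *    si_enter_thread0_section(ctx, &section, thread_id);
 *    {
 *       value = ...;   // executed with only lane 0 active
 *    }
 *    si_exit_thread0_section(&section, &value);
 *    // "value" is now uniform, broadcast to all lanes via readlane
 */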
310
311 void si_build_prim_discard_compute_shader(struct si_shader_context *ctx)
312 {
313 struct si_shader_key *key = &ctx->shader->key;
314 LLVMBuilderRef builder = ctx->ac.builder;
315 LLVMValueRef vs = ctx->main_fn;
316
317 /* Always inline the VS function. */
318 ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE);
319 LLVMSetLinkage(vs, LLVMPrivateLinkage);
320
321 enum ac_arg_type const_desc_type;
322 if (ctx->shader->selector->info.const_buffers_declared == 1 &&
323 ctx->shader->selector->info.shader_buffers_declared == 0)
324 const_desc_type = AC_ARG_CONST_FLOAT_PTR;
325 else
326 const_desc_type = AC_ARG_CONST_DESC_PTR;
327
328 memset(&ctx->args, 0, sizeof(ctx->args));
329
330 struct ac_arg param_index_buffers_and_constants, param_vertex_counter;
331 struct ac_arg param_vb_desc, param_const_desc;
332 struct ac_arg param_base_vertex, param_start_instance;
333 struct ac_arg param_block_id, param_local_id, param_ordered_wave_id;
334 struct ac_arg param_restart_index, param_smallprim_precision;
335 struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms;
336 struct ac_arg param_sampler_desc, param_last_wave_prim_id, param_vertex_count_addr;
337
338 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
339 &param_index_buffers_and_constants);
340 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_vertex_counter);
341 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_last_wave_prim_id);
342 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_vertex_count_addr);
343 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
344 &param_vb_desc);
345 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type,
346 &param_const_desc);
347 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
348 &param_sampler_desc);
349 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_base_vertex);
350 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_start_instance);
351 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_multiplier);
352 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_terms);
353 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_restart_index);
354 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, &param_smallprim_precision);
355
356 /* Block ID and thread ID inputs. */
357 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_block_id);
358 if (VERTEX_COUNTER_GDS_MODE == 2)
359 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_ordered_wave_id);
360 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &param_local_id);
361
362 /* Create the compute shader function. */
363 unsigned old_type = ctx->type;
364 ctx->type = PIPE_SHADER_COMPUTE;
365 si_create_function(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE);
366 ctx->type = old_type;
367
368 if (VERTEX_COUNTER_GDS_MODE == 1) {
369 ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size",
370 GDS_SIZE_UNORDERED);
371 }
372
373 /* Assemble parameters for VS. */
374 LLVMValueRef vs_params[16];
375 unsigned num_vs_params = 0;
376 unsigned param_vertex_id, param_instance_id;
377
378 vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */
379 vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */
380 vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc);
381 vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc);
382 vs_params[num_vs_params++] = LLVMConstInt(ctx->i32,
383 S_VS_STATE_INDEXED(key->opt.cs_indexed), 0);
384 vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex);
385 vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance);
386 vs_params[num_vs_params++] = ctx->i32_0; /* DrawID */
387 vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc);
388
389 vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */
390 vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */
391 vs_params[num_vs_params++] = ctx->i32_0; /* unused (PrimID) */
392 vs_params[num_vs_params++] = ctx->i32_0; /* unused */
393
394 assert(num_vs_params <= ARRAY_SIZE(vs_params));
395 assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs))));
396
397 /* Load descriptors. (load 8 dwords at once) */
398 LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8];
399
400 LLVMValueRef index_buffers_and_constants = ac_get_arg(&ctx->ac, param_index_buffers_and_constants);
401 tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants,
402 ac_array_in_const32_addr_space(ctx->v8i32), "");
403 tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->i32_0);
404
405 for (unsigned i = 0; i < 8; i++)
406 desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i);
407
408 input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4);
409 output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4);
410
411 /* Compute PrimID and InstanceID. */
412 LLVMValueRef global_thread_id =
413 ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id),
414 LLVMConstInt(ctx->i32, THREADGROUP_SIZE, 0),
415 ac_get_arg(&ctx->ac, param_local_id));
416 LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */
417 LLVMValueRef instance_id = ctx->i32_0;
418
419 if (key->opt.cs_instancing) {
420 LLVMValueRef num_prims_udiv_terms =
421 ac_get_arg(&ctx->ac, param_num_prims_udiv_terms);
422 LLVMValueRef num_prims_udiv_multiplier =
423 ac_get_arg(&ctx->ac, param_num_prims_udiv_multiplier);
424 /* Unpack num_prims_udiv_terms. */
425 LLVMValueRef post_shift = LLVMBuildAnd(builder, num_prims_udiv_terms,
426 LLVMConstInt(ctx->i32, 0x1f, 0), "");
427 LLVMValueRef prims_per_instance = LLVMBuildLShr(builder, num_prims_udiv_terms,
428 LLVMConstInt(ctx->i32, 5, 0), "");
429 /* Divide the total prim_id by the number of prims per instance. */
430 instance_id = ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id,
431 num_prims_udiv_multiplier,
432 post_shift);
433 /* Compute the remainder. */
434 prim_id = LLVMBuildSub(builder, prim_id,
435 LLVMBuildMul(builder, instance_id,
436 prims_per_instance, ""), "");
437 }
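
/* Worked example (editor's note): with 100 primitives per instance, the packed
 * terms above give prims_per_instance = 100, so for global prim_id = 205 the
 * fast division yields instance_id = 2 and a per-instance prim_id of 5. The
 * multiplier/post_shift pair comes from si_compute_fast_udiv_info32() on the
 * CPU, so the shader never executes a real integer division.
 */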
438
439 /* Generate indices (like a non-indexed draw call). */
440 LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->i32)};
441 unsigned vertices_per_prim = 3;
442
443 switch (key->opt.cs_prim_type) {
444 case PIPE_PRIM_TRIANGLES:
445 for (unsigned i = 0; i < 3; i++) {
446 index[i] = ac_build_imad(&ctx->ac, prim_id,
447 LLVMConstInt(ctx->i32, 3, 0),
448 LLVMConstInt(ctx->i32, i, 0));
449 }
450 break;
451 case PIPE_PRIM_TRIANGLE_STRIP:
452 for (unsigned i = 0; i < 3; i++) {
453 index[i] = LLVMBuildAdd(builder, prim_id,
454 LLVMConstInt(ctx->i32, i, 0), "");
455 }
456 break;
457 case PIPE_PRIM_TRIANGLE_FAN:
458 /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper
459 * and rasterizer as a normal triangle, so we need to put the provoking
460 * vertex into the correct index variable and preserve orientation at the same time.
461 * gl_VertexID is preserved, because it's equal to the index.
462 */
463 if (key->opt.cs_provoking_vertex_first) {
464 index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), "");
465 index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), "");
466 index[2] = ctx->i32_0;
467 } else {
468 index[0] = ctx->i32_0;
469 index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), "");
470 index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), "");
471 }
472 break;
473 default:
474 unreachable("unexpected primitive type");
475 }
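
/* Example (editor's note) of the indices generated above for prim_id = 2:
 *    TRIANGLES:      {6, 7, 8}
 *    TRIANGLE_STRIP: {2, 3, 4}   (odd primitives are reordered further down)
 *    TRIANGLE_FAN:   {0, 3, 4} with last-vertex provoking, {3, 4, 0} otherwise
 */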
476
477 /* Fetch indices. */
478 if (key->opt.cs_indexed) {
479 for (unsigned i = 0; i < 3; i++) {
480 index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf,
481 index[i], ctx->i32_0, 1,
482 0, true);
483 index[i] = ac_to_integer(&ctx->ac, index[i]);
484 }
485 }
486
487 LLVMValueRef ordered_wave_id = ac_get_arg(&ctx->ac, param_ordered_wave_id);
488
489 /* Extract the ordered wave ID. */
490 if (VERTEX_COUNTER_GDS_MODE == 2) {
491 ordered_wave_id = LLVMBuildLShr(builder, ordered_wave_id,
492 LLVMConstInt(ctx->i32, 6, 0), "");
493 ordered_wave_id = LLVMBuildAnd(builder, ordered_wave_id,
494 LLVMConstInt(ctx->i32, 0xfff, 0), "");
495 }
496 LLVMValueRef thread_id =
497 LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id),
498 LLVMConstInt(ctx->i32, 63, 0), "");
499
500 /* Every other triangle in a strip has a reversed vertex order, so we
501 * need to swap vertices of odd primitives to get the correct primitive
502 * orientation when converting triangle strips to triangles. Primitive
503 * restart complicates it, because a strip can start anywhere.
504 */
505 LLVMValueRef prim_restart_accepted = ctx->i1true;
506 LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter);
507
508 if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) {
509 /* Without primitive restart, odd primitives have reversed orientation.
510 * Only primitive restart can flip it with respect to the first vertex
511 * of the draw call.
512 */
513 LLVMValueRef first_is_odd = ctx->i1false;
514
515 /* Handle primitive restart. */
516 if (key->opt.cs_primitive_restart) {
517 /* Get the GDS primitive restart continue flag and clear
518 * the flag in vertex_counter. This flag is used when the draw
519 * call was split and we need to load the primitive orientation
520 * flag from GDS for the first wave too.
521 */
522 LLVMValueRef gds_prim_restart_continue =
523 LLVMBuildLShr(builder, vertex_counter,
524 LLVMConstInt(ctx->i32, 31, 0), "");
525 gds_prim_restart_continue =
526 LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->i1, "");
527 vertex_counter = LLVMBuildAnd(builder, vertex_counter,
528 LLVMConstInt(ctx->i32, 0x7fffffff, 0), "");
529
530 LLVMValueRef index0_is_reset;
531
532 for (unsigned i = 0; i < 3; i++) {
533 LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i],
534 ac_get_arg(&ctx->ac, param_restart_index),
535 "");
536 if (i == 0)
537 index0_is_reset = LLVMBuildNot(builder, not_reset, "");
538 prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted,
539 not_reset, "");
540 }
541
542 /* If the previous waves flip the primitive orientation
543 * of the current triangle strip, it will be stored in GDS.
544 *
545 * Sometimes the correct orientation is not needed, in which case
546 * we don't need to execute this.
547 */
548 if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) {
549 /* If there are reset indices in this wave, get the thread index
550 * where the most recent strip starts relative to each thread.
551 */
552 LLVMValueRef preceding_threads_mask =
553 LLVMBuildSub(builder,
554 LLVMBuildShl(builder, ctx->ac.i64_1,
555 LLVMBuildZExt(builder, thread_id, ctx->i64, ""), ""),
556 ctx->ac.i64_1, "");
557
558 LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset);
559 LLVMValueRef preceding_reset_threadmask =
560 LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, "");
561 LLVMValueRef strip_start =
562 ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL);
563 strip_start = LLVMBuildAdd(builder, strip_start, ctx->i32_1, "");
564
 565 /* This flips the orientation based on reset indices within this wave only. */
566 first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->i1, "");
567
568 LLVMValueRef last_strip_start, prev_wave_state, ret, tmp;
569 LLVMValueRef is_first_wave, current_wave_resets_index;
570
571 /* Get the thread index where the last strip starts in this wave.
572 *
573 * If the last strip doesn't start in this wave, the thread index
574 * will be 0.
575 *
576 * If the last strip starts in the next wave, the thread index will
577 * be 64.
578 */
579 last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL);
580 last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->i32_1, "");
581
582 struct si_thread0_section section;
583 si_enter_thread0_section(ctx, &section, thread_id);
584
585 /* This must be done in the thread 0 section, because
586 * we expect PrimID to be 0 for the whole first wave
587 * in this expression.
588 *
589 * NOTE: This will need to be different if we wanna support
590 * instancing with primitive restart.
591 */
592 is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->i32_0, "");
593 is_first_wave = LLVMBuildAnd(builder, is_first_wave,
594 LLVMBuildNot(builder,
595 gds_prim_restart_continue, ""), "");
596 current_wave_resets_index = LLVMBuildICmp(builder, LLVMIntNE,
597 last_strip_start, ctx->i32_0, "");
598
599 ret = ac_build_alloca_undef(&ctx->ac, ctx->i32, "prev_state");
600
601 /* Save the last strip start primitive index in GDS and read
602 * the value that previous waves stored.
603 *
604 * if (is_first_wave || current_wave_resets_strip)
605 * // Read the value that previous waves stored and store a new one.
606 * first_is_odd = ds.ordered.swap(last_strip_start);
607 * else
608 * // Just read the value that previous waves stored.
609 * first_is_odd = ds.ordered.add(0);
610 */
611 ac_build_ifcc(&ctx->ac,
612 LLVMBuildOr(builder, is_first_wave,
613 current_wave_resets_index, ""), 12602);
614 {
615 /* The GDS address is always 0 with ordered append. */
616 tmp = si_build_ds_ordered_op(ctx, "swap",
617 ordered_wave_id, last_strip_start,
618 1, true, false);
619 LLVMBuildStore(builder, tmp, ret);
620 }
621 ac_build_else(&ctx->ac, 12603);
622 {
623 /* Just read the value from GDS. */
624 tmp = si_build_ds_ordered_op(ctx, "add",
625 ordered_wave_id, ctx->i32_0,
626 1, true, false);
627 LLVMBuildStore(builder, tmp, ret);
628 }
629 ac_build_endif(&ctx->ac, 12602);
630
631 prev_wave_state = LLVMBuildLoad(builder, ret, "");
632 /* Ignore the return value if this is the first wave. */
633 prev_wave_state = LLVMBuildSelect(builder, is_first_wave,
634 ctx->i32_0, prev_wave_state, "");
635 si_exit_thread0_section(&section, &prev_wave_state);
636 prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->i1, "");
637
638 /* If the strip start appears to be on thread 0 for the current primitive
639 * (meaning the reset index is not present in this wave and might have
640 * appeared in previous waves), use the value from GDS to determine
641 * primitive orientation.
642 *
643 * If the strip start is in this wave for the current primitive, use
644 * the value from the current wave to determine primitive orientation.
645 */
646 LLVMValueRef strip_start_is0 = LLVMBuildICmp(builder, LLVMIntEQ,
647 strip_start, ctx->i32_0, "");
648 first_is_odd = LLVMBuildSelect(builder, strip_start_is0, prev_wave_state,
649 first_is_odd, "");
650 }
651 }
652 /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */
653 LLVMValueRef prim_is_odd =
654 LLVMBuildXor(builder, first_is_odd,
655 LLVMBuildTrunc(builder, thread_id, ctx->i1, ""), "");
656
657 /* Determine the primitive orientation.
658 * Only swap the vertices that are not the provoking vertex. We need to keep
659 * the provoking vertex in place.
660 */
661 if (key->opt.cs_provoking_vertex_first) {
662 LLVMValueRef index1 = index[1];
663 LLVMValueRef index2 = index[2];
664 index[1] = LLVMBuildSelect(builder, prim_is_odd, index2, index1, "");
665 index[2] = LLVMBuildSelect(builder, prim_is_odd, index1, index2, "");
666 } else {
667 LLVMValueRef index0 = index[0];
668 LLVMValueRef index1 = index[1];
669 index[0] = LLVMBuildSelect(builder, prim_is_odd, index1, index0, "");
670 index[1] = LLVMBuildSelect(builder, prim_is_odd, index0, index1, "");
671 }
672 }
673
674 /* Execute the vertex shader for each vertex to get vertex positions. */
675 LLVMValueRef pos[3][4];
676 for (unsigned i = 0; i < vertices_per_prim; i++) {
677 vs_params[param_vertex_id] = index[i];
678 vs_params[param_instance_id] = instance_id;
679
680 LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params);
681 for (unsigned chan = 0; chan < 4; chan++)
682 pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, "");
683 }
684
685 /* Divide XYZ by W. */
686 for (unsigned i = 0; i < vertices_per_prim; i++) {
687 for (unsigned chan = 0; chan < 3; chan++)
688 pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]);
689 }
690
691 /* Load the viewport state. */
692 LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants,
693 LLVMConstInt(ctx->i32, 2, 0));
694 vp = LLVMBuildBitCast(builder, vp, ctx->v4f32, "");
695 LLVMValueRef vp_scale[2], vp_translate[2];
696 vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
697 vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
698 vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
699 vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);
700
701 /* Do culling. */
702 struct ac_cull_options options = {};
703 options.cull_front = key->opt.cs_cull_front;
704 options.cull_back = key->opt.cs_cull_back;
705 options.cull_view_xy = true;
706 options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z;
707 options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z;
708 options.cull_small_prims = true;
709 options.cull_zero_area = true;
710 options.cull_w = true;
711 options.use_halfz_clip_space = key->opt.cs_halfz_clip_space;
712
713 LLVMValueRef accepted =
714 ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted,
715 vp_scale, vp_translate,
716 ac_get_arg(&ctx->ac, param_smallprim_precision),
717 &options);
718
719 LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted);
720
721 /* Count the number of active threads by doing bitcount(accepted). */
722 LLVMValueRef num_prims_accepted =
723 ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i64", ctx->i64,
724 &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE);
725 num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->i32, "");
726
727 LLVMValueRef start;
728
729 /* Execute atomic_add on the vertex count. */
730 struct si_thread0_section section;
731 si_enter_thread0_section(ctx, &section, thread_id);
732 {
733 if (VERTEX_COUNTER_GDS_MODE == 0) {
734 LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
735 LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
736 vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter);
737 start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
738 vertex_counter, num_indices,
739 LLVMAtomicOrderingMonotonic, false);
740 } else if (VERTEX_COUNTER_GDS_MODE == 1) {
741 LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
742 LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
743 vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter,
744 LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), "");
745 start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
746 vertex_counter, num_indices,
747 LLVMAtomicOrderingMonotonic, false);
748 } else if (VERTEX_COUNTER_GDS_MODE == 2) {
749 LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->i32, "");
750
751 /* If the draw call was split into multiple subdraws, each using
752 * a separate draw packet, we need to start counting from 0 for
753 * the first compute wave of the subdraw.
754 *
755 * vertex_counter contains the primitive ID of the first thread
756 * in the first wave.
757 *
758 * This is only correct with VERTEX_COUNTER_GDS_MODE == 2:
759 */
760 LLVMValueRef is_first_wave =
761 LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
762 vertex_counter, "");
763
764 /* Store the primitive count for ordered append, not vertex count.
765 * The idea is to avoid GDS initialization via CP DMA. The shader
766 * effectively stores the first count using "swap".
767 *
768 * if (first_wave) {
769 * ds.ordered.swap(num_prims_accepted); // store the first primitive count
770 * previous = 0;
771 * } else {
772 * previous = ds.ordered.add(num_prims_accepted) // add the primitive count
773 * }
774 */
775 ac_build_ifcc(&ctx->ac, is_first_wave, 12604);
776 {
777 /* The GDS address is always 0 with ordered append. */
778 si_build_ds_ordered_op(ctx, "swap", ordered_wave_id,
779 num_prims_accepted, 0, true, true);
780 LLVMBuildStore(builder, ctx->i32_0, tmp_store);
781 }
782 ac_build_else(&ctx->ac, 12605);
783 {
784 LLVMBuildStore(builder,
785 si_build_ds_ordered_op(ctx, "add", ordered_wave_id,
786 num_prims_accepted, 0,
787 true, true),
788 tmp_store);
789 }
790 ac_build_endif(&ctx->ac, 12604);
791
792 start = LLVMBuildLoad(builder, tmp_store, "");
793 }
794 }
795 si_exit_thread0_section(&section, &start);
796
797 /* Write the final vertex count to memory. An EOS/EOP event could do this,
798 * but those events are super slow and should be avoided if performance
799 * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE
800 * event like this.
801 */
802 if (VERTEX_COUNTER_GDS_MODE == 2) {
803 ac_build_ifcc(&ctx->ac,
804 LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
805 ac_get_arg(&ctx->ac, param_last_wave_prim_id), ""),
806 12606);
807 LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, "");
808 count = LLVMBuildMul(builder, count,
809 LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
810
811 /* GFX8 needs to disable caching, so that the CP can see the stored value.
812 * MTYPE=3 bypasses TC L2.
813 */
814 if (ctx->screen->info.chip_class <= GFX8) {
815 LLVMValueRef desc[] = {
816 ac_get_arg(&ctx->ac, param_vertex_count_addr),
817 LLVMConstInt(ctx->i32,
818 S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0),
819 LLVMConstInt(ctx->i32, 4, 0),
820 LLVMConstInt(ctx->i32, S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
821 S_008F0C_MTYPE(3 /* uncached */), 0),
822 };
823 LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4);
824 ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->i32_0,
825 ctx->i32_0, 0, ac_glc | ac_slc);
826 } else {
827 LLVMBuildStore(builder, count,
828 si_expand_32bit_pointer(ctx,
829 ac_get_arg(&ctx->ac,
830 param_vertex_count_addr)));
831 }
832 ac_build_endif(&ctx->ac, 12606);
833 } else {
834 /* For unordered modes that increment a vertex count instead of
835 * primitive count, convert it into the primitive index.
836 */
837 start = LLVMBuildUDiv(builder, start,
838 LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
839 }
840
841 /* Now we need to store the indices of accepted primitives into
842 * the output index buffer.
843 */
844 ac_build_ifcc(&ctx->ac, accepted, 16607);
845 {
846 /* Get the number of bits set before the index of this thread. */
847 LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask);
848
849 /* We have lowered instancing. Pack the instance ID into vertex ID. */
850 if (key->opt.cs_instancing) {
851 instance_id = LLVMBuildShl(builder, instance_id,
852 LLVMConstInt(ctx->i32, 16, 0), "");
853
854 for (unsigned i = 0; i < vertices_per_prim; i++)
855 index[i] = LLVMBuildOr(builder, index[i], instance_id, "");
856 }
857
858 if (VERTEX_COUNTER_GDS_MODE == 2) {
859 /* vertex_counter contains the first primitive ID
860 * for this dispatch. If the draw call was split into
861 * multiple subdraws, the first primitive ID is > 0
862 * for subsequent subdraws. Each subdraw uses a different
863 * portion of the output index buffer. Offset the store
864 * vindex by the first primitive ID to get the correct
865 * store address for the subdraw.
866 */
867 start = LLVMBuildAdd(builder, start, vertex_counter, "");
868 }
869
870 /* Write indices for accepted primitives. */
871 LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, "");
872 LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3);
873
874 if (!ac_has_vec3_support(ctx->ac.chip_class, true))
875 vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3);
876
877 ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata,
878 vindex, ctx->i32_0, 3,
879 ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0));
880 }
881 ac_build_endif(&ctx->ac, 16607);
882
883 LLVMBuildRetVoid(builder);
884 }
885
886 /* Return false if the shader isn't ready. */
887 static bool si_shader_select_prim_discard_cs(struct si_context *sctx,
888 const struct pipe_draw_info *info,
889 bool primitive_restart)
890 {
891 struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
892 struct si_shader_key key;
893
894 /* Primitive restart needs ordered counters. */
895 assert(!primitive_restart || VERTEX_COUNTER_GDS_MODE == 2);
896 assert(!primitive_restart || info->instance_count == 1);
897
898 memset(&key, 0, sizeof(key));
899 si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, &key, &key.part.vs.prolog);
900 assert(!key.part.vs.prolog.instance_divisor_is_fetched);
901
902 key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0;
903 key.opt.vs_as_prim_discard_cs = 1;
904 key.opt.cs_prim_type = info->mode;
905 key.opt.cs_indexed = info->index_size != 0;
906 key.opt.cs_instancing = info->instance_count > 1;
907 key.opt.cs_primitive_restart = primitive_restart;
908 key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first;
909
910 /* Primitive restart with triangle strips needs to preserve primitive
911 * orientation for cases where front and back primitive orientation matters.
912 */
913 if (primitive_restart) {
914 struct si_shader_selector *ps = sctx->ps_shader.cso;
915
916 key.opt.cs_need_correct_orientation =
917 rs->cull_front != rs->cull_back ||
918 ps->info.uses_frontface ||
919 (rs->two_side && ps->info.colors_read);
920 }
921
922 if (rs->rasterizer_discard) {
923 /* Just for performance testing and analysis of trivial bottlenecks.
924 * This should result in a very short compute shader. */
925 key.opt.cs_cull_front = 1;
926 key.opt.cs_cull_back = 1;
927 } else {
928 key.opt.cs_cull_front =
929 sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front;
930 key.opt.cs_cull_back =
931 sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back;
932 }
933
934 if (!rs->depth_clamp_any && CULL_Z) {
935 key.opt.cs_cull_z = 1;
936 key.opt.cs_halfz_clip_space = rs->clip_halfz;
937 }
938
939 sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso;
940 sctx->cs_prim_discard_state.current = NULL;
941
942 if (!sctx->compiler.passes)
943 si_init_compiler(sctx->screen, &sctx->compiler);
944
945 struct si_compiler_ctx_state compiler_state;
946 compiler_state.compiler = &sctx->compiler;
947 compiler_state.debug = sctx->debug;
948 compiler_state.is_debug_context = sctx->is_debug;
949
950 return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state,
951 &compiler_state, &key, -1, true) == 0 &&
952 /* Disallow compute shaders using the scratch buffer. */
953 sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0;
954 }
955
956 static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx)
957 {
958 if (sctx->index_ring)
959 return true;
960
961 if (!sctx->prim_discard_compute_cs) {
962 struct radeon_winsys *ws = sctx->ws;
963 unsigned gds_size = VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED :
964 VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0;
965 unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 2 : 0;
966
967 if (gds_size) {
968 sctx->gds = ws->buffer_create(ws, gds_size, 4,
969 RADEON_DOMAIN_GDS, 0);
970 if (!sctx->gds)
971 return false;
972
973 ws->cs_add_buffer(sctx->gfx_cs, sctx->gds,
974 RADEON_USAGE_READWRITE, 0, 0);
975 }
976 if (num_oa_counters) {
977 assert(gds_size);
978 sctx->gds_oa = ws->buffer_create(ws, num_oa_counters,
979 1, RADEON_DOMAIN_OA, 0);
980 if (!sctx->gds_oa)
981 return false;
982
983 ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa,
984 RADEON_USAGE_READWRITE, 0, 0);
985 }
986
987 sctx->prim_discard_compute_cs =
988 ws->cs_add_parallel_compute_ib(sctx->gfx_cs,
989 num_oa_counters > 0);
990 if (!sctx->prim_discard_compute_cs)
991 return false;
992 }
993
994 if (!sctx->index_ring) {
995 sctx->index_ring =
996 si_aligned_buffer_create(sctx->b.screen,
997 SI_RESOURCE_FLAG_UNMAPPABLE,
998 PIPE_USAGE_DEFAULT,
999 sctx->index_ring_size_per_ib * 2,
1000 sctx->screen->info.pte_fragment_size);
1001 if (!sctx->index_ring)
1002 return false;
1003 }
1004 return true;
1005 }
1006
1007 static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_size)
1008 {
1009 return sctx->index_ring_offset +
1010 align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <=
1011 sctx->index_ring_size_per_ib;
1012 }
1013
1014 enum si_prim_discard_outcome
1015 si_prepare_prim_discard_or_split_draw(struct si_context *sctx,
1016 const struct pipe_draw_info *info,
1017 bool primitive_restart)
1018 {
1019 /* If the compute shader compilation isn't finished, this returns false. */
1020 if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart))
1021 return SI_PRIM_DISCARD_DISABLED;
1022
1023 if (!si_initialize_prim_discard_cmdbuf(sctx))
1024 return SI_PRIM_DISCARD_DISABLED;
1025
1026 struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
1027 unsigned prim = info->mode;
1028 unsigned count = info->count;
1029 unsigned instance_count = info->instance_count;
1030 unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count);
1031 unsigned num_prims = num_prims_per_instance * instance_count;
1032 unsigned out_indexbuf_size = num_prims * 12;
1033 bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size);
1034 const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL;
1035
1036 /* Split draws at the draw call level if the ring is full. This makes
1037 * better use of the ring space.
1038 */
1039 if (ring_full &&
1040 num_prims > split_prims_draw_level &&
1041 instance_count == 1 && /* TODO: support splitting instanced draws */
1042 (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) |
1043 (1 << PIPE_PRIM_TRIANGLE_STRIP))) {
1044 /* Split draws. */
1045 struct pipe_draw_info split_draw = *info;
1046 split_draw.primitive_restart = primitive_restart;
1047
1048 unsigned base_start = split_draw.start;
1049
1050 if (prim == PIPE_PRIM_TRIANGLES) {
1051 unsigned vert_count_per_subdraw = split_prims_draw_level * 3;
1052 assert(vert_count_per_subdraw < count);
1053
1054 for (unsigned start = 0; start < count; start += vert_count_per_subdraw) {
1055 split_draw.start = base_start + start;
1056 split_draw.count = MIN2(count - start, vert_count_per_subdraw);
1057
1058 sctx->b.draw_vbo(&sctx->b, &split_draw);
1059 }
1060 } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) {
1061 /* No primitive pair can be split, because strips reverse orientation
1062 * for odd primitives. */
1063 STATIC_ASSERT(split_prims_draw_level % 2 == 0);
1064
1065 unsigned vert_count_per_subdraw = split_prims_draw_level;
1066
1067 for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) {
1068 split_draw.start = base_start + start;
1069 split_draw.count = MIN2(count - start, vert_count_per_subdraw + 2);
1070
1071 sctx->b.draw_vbo(&sctx->b, &split_draw);
1072
1073 if (start == 0 &&
1074 primitive_restart &&
1075 sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation)
1076 sctx->preserve_prim_restart_gds_at_flush = true;
1077 }
1078 sctx->preserve_prim_restart_gds_at_flush = false;
1079 } else {
1080 assert(0);
1081 }
1082
1083 return SI_PRIM_DISCARD_DRAW_SPLIT;
1084 }
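
/* Worked example (editor's note, using a hypothetical split level of 4 instead
 * of the real SPLIT_PRIMS_DRAW_LEVEL): a 10-vertex triangle strip (8 primitives)
 * is split into {start 0, count 6} and {start 4, count 6}. The two shared
 * vertices are re-sent, and because the split level is even, each subdraw
 * starts on an even primitive, so the strip orientation is preserved.
 */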
1085
1086 /* Just quit if the draw call doesn't fit into the ring and can't be split. */
1087 if (out_indexbuf_size > sctx->index_ring_size_per_ib) {
1088 if (SI_PRIM_DISCARD_DEBUG)
1089 puts("PD failed: draw call too big, can't be split");
1090 return SI_PRIM_DISCARD_DISABLED;
1091 }
1092
1093 unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL);
1094 unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ +
1095 24 * (num_subdraws - 1) + /* subdraws */
1096 20; /* leave some space at the end */
1097 unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx);
1098
1099 if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION)
1100 need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */
1101 else
1102 need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */
1103
1104 if (ring_full ||
1105 (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) ||
1106 !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) {
1107 /* If the current IB is empty but the size is too small, add a NOP
1108 * packet to force a flush and get a bigger IB.
1109 */
1110 if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) &&
1111 gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) {
1112 radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
1113 radeon_emit(gfx_cs, 0);
1114 }
1115
1116 si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
1117 }
1118
1119 /* The compute IB is always chained, but we need to call cs_check_space to add more space. */
1120 struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
1121 ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false);
1122 assert(compute_has_space);
1123 assert(si_check_ring_space(sctx, out_indexbuf_size));
1124 return SI_PRIM_DISCARD_ENABLED;
1125 }
1126
1127 void si_compute_signal_gfx(struct si_context *sctx)
1128 {
1129 struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
1130 unsigned writeback_L2_flags = 0;
1131
1132 /* The writeback L2 flags vary with each chip generation. */
1133 /* CI needs to flush vertex indices to memory. */
1134 if (sctx->chip_class <= GFX7)
1135 writeback_L2_flags = EVENT_TC_WB_ACTION_ENA;
1136 else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0)
1137 writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA;
1138
1139 if (!sctx->compute_num_prims_in_batch)
1140 return;
1141
1142 assert(sctx->compute_rewind_va);
1143
1144 /* After the queued dispatches are done and vertex counts are written to
1145 * the gfx IB, signal the gfx IB to continue. CP doesn't wait for
1146 * the dispatches to finish, it only adds the CS_DONE event into the event
1147 * queue.
1148 */
1149 si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags,
1150 sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
1151 writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM :
1152 EOP_INT_SEL_NONE,
1153 EOP_DATA_SEL_VALUE_32BIT,
1154 NULL,
1155 sctx->compute_rewind_va |
1156 ((uint64_t)sctx->screen->info.address32_hi << 32),
1157 REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */
1158 SI_NOT_QUERY);
1159
1160 sctx->compute_rewind_va = 0;
1161 sctx->compute_num_prims_in_batch = 0;
1162 }
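
/* Sketch of the handshake with the gfx IB (see the file header comment):
 *
 *    gfx IB:     ... REWIND(word at compute_rewind_va) ... draw packet whose
 *                count word is patched by the last compute wave
 *    compute IB: dispatches ... CS_DONE release_mem writes REWIND_SIGNAL_BIT
 *                (0x80000000) to compute_rewind_va
 *
 * The CP stalls at the REWIND packet until the signal value appears; by then
 * the last wave has already written the final index count, so the draw
 * proceeds with only the indices that survived culling.
 */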
1163
1164 /* Dispatch a primitive discard compute shader. */
1165 void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
1166 const struct pipe_draw_info *info,
1167 unsigned index_size,
1168 unsigned base_vertex,
1169 uint64_t input_indexbuf_va,
1170 unsigned input_indexbuf_num_elements)
1171 {
1172 struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
1173 struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
1174 unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, info->count);
1175 if (!num_prims_per_instance)
1176 return;
1177
1178 unsigned num_prims = num_prims_per_instance * info->instance_count;
1179 unsigned vertices_per_prim, output_indexbuf_format;
1180
1181 switch (info->mode) {
1182 case PIPE_PRIM_TRIANGLES:
1183 case PIPE_PRIM_TRIANGLE_STRIP:
1184 case PIPE_PRIM_TRIANGLE_FAN:
1185 vertices_per_prim = 3;
1186 output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32;
1187 break;
1188 default:
1189 unreachable("unsupported primitive type");
1190 return;
1191 }
1192
1193 unsigned out_indexbuf_offset;
1194 uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4;
1195 bool first_dispatch = !sctx->prim_discard_compute_ib_initialized;
1196
1197 /* Initialize the compute IB if it's empty. */
1198 if (!sctx->prim_discard_compute_ib_initialized) {
1199 /* 1) State initialization. */
1200 sctx->compute_gds_offset = 0;
1201 sctx->compute_ib_last_shader = NULL;
1202
1203 if (sctx->last_ib_barrier_fence) {
1204 assert(!sctx->last_ib_barrier_buf);
1205 sctx->ws->cs_add_fence_dependency(gfx_cs,
1206 sctx->last_ib_barrier_fence,
1207 RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY);
1208 }
1209
1210 /* 2) IB initialization. */
1211
1212 /* This needs to be done at the beginning of IBs due to possible
1213 * TTM buffer moves in the kernel.
1214 *
1215 * TODO: update for GFX10
1216 */
1217 si_emit_surface_sync(sctx, cs,
1218 S_0085F0_TC_ACTION_ENA(1) |
1219 S_0085F0_TCL1_ACTION_ENA(1) |
1220 S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) |
1221 S_0085F0_SH_ICACHE_ACTION_ENA(1) |
1222 S_0085F0_SH_KCACHE_ACTION_ENA(1));
1223
1224 /* Restore the GDS prim restart counter if needed. */
1225 if (sctx->preserve_prim_restart_gds_at_flush) {
1226 si_cp_copy_data(sctx, cs,
1227 COPY_DATA_GDS, NULL, 4,
1228 COPY_DATA_SRC_MEM, sctx->wait_mem_scratch, 4);
1229 }
1230
1231 si_emit_initial_compute_regs(sctx, cs);
1232
1233 radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE,
1234 S_00B860_WAVES(sctx->scratch_waves) |
1235 S_00B860_WAVESIZE(0)); /* no scratch */
1236
1237 /* Only 1D grids are launched. */
1238 radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2);
1239 radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) |
1240 S_00B820_NUM_THREAD_PARTIAL(1));
1241 radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) |
1242 S_00B824_NUM_THREAD_PARTIAL(1));
1243
1244 radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2);
1245 radeon_emit(cs, 0);
1246 radeon_emit(cs, 0);
1247
1248 /* Disable ordered alloc for OA resources. */
1249 for (unsigned i = 0; i < 2; i++) {
1250 radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3);
1251 radeon_emit(cs, S_031074_INDEX(i));
1252 radeon_emit(cs, 0);
1253 radeon_emit(cs, S_03107C_ENABLE(0));
1254 }
1255
1256 if (sctx->last_ib_barrier_buf) {
1257 assert(!sctx->last_ib_barrier_fence);
1258 radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf,
1259 RADEON_USAGE_READ, RADEON_PRIO_FENCE);
1260 si_cp_wait_mem(sctx, cs,
1261 sctx->last_ib_barrier_buf->gpu_address +
1262 sctx->last_ib_barrier_buf_offset, 1, 1,
1263 WAIT_REG_MEM_EQUAL);
1264 }
1265
1266 sctx->prim_discard_compute_ib_initialized = true;
1267 }
1268
1269 /* Allocate the output index buffer. */
1270 output_indexbuf_size = align(output_indexbuf_size,
1271 sctx->screen->info.tcc_cache_line_size);
1272 assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib);
1273 out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset;
1274 sctx->index_ring_offset += output_indexbuf_size;
1275
1276 radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE,
1277 RADEON_PRIO_SHADER_RW_BUFFER);
1278 uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset;
1279
1280 /* Prepare index buffer descriptors. */
1281 struct si_resource *indexbuf_desc = NULL;
1282 unsigned indexbuf_desc_offset;
1283 unsigned desc_size = 12 * 4;
1284 uint32_t *desc;
1285
1286 u_upload_alloc(sctx->b.const_uploader, 0, desc_size,
1287 si_optimal_tcc_alignment(sctx, desc_size),
1288 &indexbuf_desc_offset, (struct pipe_resource**)&indexbuf_desc,
1289 (void**)&desc);
1290 radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ,
1291 RADEON_PRIO_DESCRIPTORS);
1292
1293 /* Input index buffer. */
1294 desc[0] = input_indexbuf_va;
1295 desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) |
1296 S_008F04_STRIDE(index_size);
1297 desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? index_size : 1);
1298 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1299 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
1300 S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 :
1301 index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16 :
1302 V_008F0C_BUF_DATA_FORMAT_32);
1303
1304 /* Output index buffer. */
1305 desc[4] = out_indexbuf_va;
1306 desc[5] = S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) |
1307 S_008F04_STRIDE(vertices_per_prim * 4);
1308 desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1);
1309 desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1310 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1311 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
1312 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
1313 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
1314 S_008F0C_DATA_FORMAT(output_indexbuf_format);
1315
1316 /* Viewport state.
1317 * This is needed by the small primitive culling, because it's done
1318 * in screen space.
1319 */
1320 float scale[2], translate[2];
1321
1322 scale[0] = sctx->viewports.states[0].scale[0];
1323 scale[1] = sctx->viewports.states[0].scale[1];
1324 translate[0] = sctx->viewports.states[0].translate[0];
1325 translate[1] = sctx->viewports.states[0].translate[1];
1326
1327 /* The viewport shouldn't flip the X axis for the small prim culling to work. */
1328 assert(-scale[0] + translate[0] <= scale[0] + translate[0]);
1329
1330 /* If the Y axis is inverted (OpenGL default framebuffer), reverse it.
1331 * This is because the viewport transformation inverts the clip space
1332 * bounding box, so min becomes max, which breaks small primitive
1333 * culling.
1334 */
1335 if (sctx->viewports.y_inverted) {
1336 scale[1] = -scale[1];
1337 translate[1] = -translate[1];
1338 }
1339
1340 /* Scale the framebuffer up, so that samples become pixels and small
1341 * primitive culling is the same for all sample counts.
1342 * This only works with the standard DX sample positions, because
1343 * the samples are evenly spaced on both X and Y axes.
1344 */
1345 unsigned num_samples = sctx->framebuffer.nr_samples;
1346 assert(num_samples >= 1);
1347
1348 for (unsigned i = 0; i < 2; i++) {
1349 scale[i] *= num_samples;
1350 translate[i] *= num_samples;
1351 }
1352
1353 desc[8] = fui(scale[0]);
1354 desc[9] = fui(scale[1]);
1355 desc[10] = fui(translate[0]);
1356 desc[11] = fui(translate[1]);
1357
1358 /* Better subpixel precision increases the efficiency of small
1359 * primitive culling. */
1360 unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode;
1361 float small_prim_cull_precision;
1362
1363 if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
1364 small_prim_cull_precision = num_samples / 4096.0;
1365 else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
1366 small_prim_cull_precision = num_samples / 1024.0;
1367 else
1368 small_prim_cull_precision = num_samples / 256.0;
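
/* Example (editor's note): with 4x MSAA and the 1/256th quantization mode,
 * small_prim_cull_precision = 4 / 256 = 1/64. The viewport scale/translate
 * were multiplied by the sample count above, so samples are treated as pixels
 * and the culling threshold scales with the rasterizer's subpixel quantization.
 */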

   /* Set user data SGPRs. */
   /* This can't be greater than 14 if we want the fastest launch rate. */
   unsigned user_sgprs = 13;

   uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset;
   unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX);
   unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX);
   uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address;
   uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address;
   uint64_t vb_desc_va = sctx->vb_descriptors_buffer ?
                            sctx->vb_descriptors_buffer->gpu_address +
                            sctx->vb_descriptors_offset : 0;
   unsigned gds_offset, gds_size;
   struct si_fast_udiv_info32 num_prims_udiv = {};

   if (info->instance_count > 1)
      num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31);
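   /* The multiplier/post_shift pair presumably lets the shader recover the
    * instance index with a multiply-high and shift, roughly
    * instance = ((uint64_t)prim_id * multiplier) >> (32 + post_shift),
    * instead of doing an integer division by num_prims_per_instance per thread.
    */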

   /* Limitations on how these two are packed in the user SGPR. */
   assert(num_prims_udiv.post_shift < 32);
   assert(num_prims_per_instance < 1 << 27);

   si_resource_reference(&indexbuf_desc, NULL);

   bool primitive_restart = sctx->cs_prim_discard_state.current->key.opt.cs_primitive_restart;

   if (VERTEX_COUNTER_GDS_MODE == 1) {
      gds_offset = sctx->compute_gds_offset;
      gds_size = primitive_restart ? 8 : 4;
      sctx->compute_gds_offset += gds_size;
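      /* Each draw gets its own counter slot: 4 bytes, or 8 when primitive
       * restart needs a second dword. compute_gds_offset advances so that
       * later draws in the same IB presumably get fresh counters.
       */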

      /* Reset the counters in GDS for the first dispatch using WRITE_DATA.
       * The remainder of the GDS will be cleared after the dispatch packet
       * in parallel with compute shaders.
       */
      if (first_dispatch) {
         radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size/4, 0));
         radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1));
         radeon_emit(cs, gds_offset);
         radeon_emit(cs, 0);
         radeon_emit(cs, 0); /* value to write */
         if (gds_size == 8)
            radeon_emit(cs, 0);
      }
   }

   /* Set shader registers. */
   struct si_shader *shader = sctx->cs_prim_discard_state.current;

   if (shader != sctx->compute_ib_last_shader) {
      radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ,
                                RADEON_PRIO_SHADER_BINARY);
      uint64_t shader_va = shader->bo->gpu_address;

      assert(shader->config.scratch_bytes_per_wave == 0);
      assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4);

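      /* COMPUTE_PGM_LO/HI take the shader address in 256-byte units, and
       * RSRC1 encodes register usage in hardware granules of 4 VGPRs and
       * 8 SGPRs, hence the (n - 1) / 4 and (n - 1) / 8 below.
       */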
      radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
      radeon_emit(cs, shader_va >> 8);
      radeon_emit(cs, S_00B834_DATA(shader_va >> 40));

      radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
      radeon_emit(cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) |
                      S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8) |
                      S_00B848_FLOAT_MODE(shader->config.float_mode) |
                      S_00B848_DX10_CLAMP(1));
      radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) |
                      S_00B84C_USER_SGPR(user_sgprs) |
                      S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) |
                      S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) |
                      S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) |
                      S_00B84C_LDS_SIZE(shader->config.lds_size));

      radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
                        ac_get_compute_resource_limits(&sctx->screen->info,
                                                       WAVES_PER_TG,
                                                       MAX_WAVES_PER_SH,
                                                       THREADGROUPS_PER_CU));
      sctx->compute_ib_last_shader = shader;
   }

   STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0);

   /* Big draw calls are split into smaller dispatches and draw packets. */
   for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) {
      unsigned num_subdraw_prims;

      if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims)
         num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL;
      else
         num_subdraw_prims = num_prims - start_prim;

      /* Small dispatches are executed back to back until a specific primitive
       * count is reached. Then, a CS_DONE is inserted to signal the gfx IB
       * to start drawing the batch. This batching adds latency to the gfx IB,
       * but emitting CS_DONE and REWIND for every small dispatch would be
       * too slow.
       */
      if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH)
         si_compute_signal_gfx(sctx);

      if (sctx->compute_num_prims_in_batch == 0) {
         assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi);
         sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4;
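         /* compute_rewind_va is the dword right after the packet header
          * emitted below (current.cdw + 1); the gfx IB will not proceed past
          * this point until REWIND_SIGNAL_BIT is written there.
          */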

         if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) {
            radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
            radeon_emit(gfx_cs, 0);

            si_cp_wait_mem(sctx, gfx_cs,
                           sctx->compute_rewind_va |
                           (uint64_t)sctx->screen->info.address32_hi << 32,
                           REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT,
                           WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP);

            /* Use INDIRECT_BUFFER to chain to a different buffer
             * to discard the CP prefetch cache.
             */
            sctx->ws->cs_check_space(gfx_cs, 0, true);
         } else {
            radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
            radeon_emit(gfx_cs, 0);
         }
      }

      sctx->compute_num_prims_in_batch += num_subdraw_prims;

      uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4;
      uint64_t index_va = out_indexbuf_va + start_prim * 12;
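      /* count_va points at the index_count dword of the DRAW_INDEX_2 packet
       * emitted below (header, max_size, index_base lo/hi, then the count at
       * cdw + 4). index_va is this subdraw's slice of the output index buffer,
       * 12 bytes (3 dwords) per output triangle.
       */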

      /* Emit the draw packet into the gfx IB. */
      radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0));
      radeon_emit(gfx_cs, num_prims * vertices_per_prim);
      radeon_emit(gfx_cs, index_va);
      radeon_emit(gfx_cs, index_va >> 32);
      radeon_emit(gfx_cs, 0);
      radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA);

      /* Continue with the compute IB. */
      if (start_prim == 0) {
         uint32_t gds_prim_restart_continue_bit = 0;

         if (sctx->preserve_prim_restart_gds_at_flush) {
            assert(primitive_restart &&
                   info->mode == PIPE_PRIM_TRIANGLE_STRIP);
            assert(start_prim < 1 << 31);
            gds_prim_restart_continue_bit = 1 << 31;
         }

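         /* All 13 user SGPRs are written for the first subdraw: the index
          * buffer descriptors, then the vertex count address, GDS offset, or
          * start prim (depending on VERTEX_COUNTER_GDS_MODE), the last prim
          * index, the count address, the VB, constant and sampler descriptor
          * addresses, base vertex, start instance, the udiv constants, the
          * restart index, and the culling precision.
          */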
         radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs);
         radeon_emit(cs, index_buffers_va);
         radeon_emit(cs,
                     VERTEX_COUNTER_GDS_MODE == 0 ? count_va :
                     VERTEX_COUNTER_GDS_MODE == 1 ? gds_offset :
                                                    start_prim |
                                                    gds_prim_restart_continue_bit);
         radeon_emit(cs, start_prim + num_subdraw_prims - 1);
         radeon_emit(cs, count_va);
         radeon_emit(cs, vb_desc_va);
         radeon_emit(cs, vs_const_desc_va);
         radeon_emit(cs, vs_sampler_desc_va);
         radeon_emit(cs, base_vertex);
         radeon_emit(cs, info->start_instance);
         radeon_emit(cs, num_prims_udiv.multiplier);
         radeon_emit(cs, num_prims_udiv.post_shift |
                         (num_prims_per_instance << 5));
         radeon_emit(cs, info->restart_index);
         /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
         radeon_emit(cs, fui(small_prim_cull_precision));
      } else {
         assert(VERTEX_COUNTER_GDS_MODE == 2);
         /* Only update the SGPRs that changed. */
         radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3);
         radeon_emit(cs, start_prim);
         radeon_emit(cs, start_prim + num_subdraw_prims - 1);
         radeon_emit(cs, count_va);
      }

      /* Set grid dimensions. */
      unsigned start_block = start_prim / THREADGROUP_SIZE;
      unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE;
      unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE;

      radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block);
      radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X,
                        S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) |
                        S_00B81C_NUM_THREAD_PARTIAL(partial_block_size));
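      /* NUM_THREAD_PARTIAL together with PARTIAL_TG_EN in the dispatch
       * initiator lets the last threadgroup run with only partial_block_size
       * threads when num_subdraw_prims isn't a multiple of THREADGROUP_SIZE.
       */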

      radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) |
                      PKT3_SHADER_TYPE_S(1));
      radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size);
      radeon_emit(cs, 1);
      radeon_emit(cs, 1);
      radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) |
                      S_00B800_PARTIAL_TG_EN(!!partial_block_size) |
                      S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) |
                      S_00B800_ORDER_MODE(0 /* launch in order */));

      /* This is only for unordered append. Ordered append writes this from
       * the shader.
       *
       * Note that EOP and EOS events are super slow, so emulating the event
       * in a shader is an important optimization.
       */
      if (VERTEX_COUNTER_GDS_MODE == 1) {
         si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0,
                           sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
                           EOP_INT_SEL_NONE,
                           EOP_DATA_SEL_GDS,
                           NULL,
                           count_va | ((uint64_t)sctx->screen->info.address32_hi << 32),
                           EOP_DATA_GDS(gds_offset / 4, 1),
                           SI_NOT_QUERY);

         /* Now that compute shaders are running, clear the remainder of GDS. */
         if (first_dispatch) {
            unsigned offset = gds_offset + gds_size;
            si_cp_dma_clear_buffer(sctx, cs, NULL, offset,
                                   GDS_SIZE_UNORDERED - offset,
                                   0,
                                   SI_CPDMA_SKIP_CHECK_CS_SPACE |
                                   SI_CPDMA_SKIP_GFX_SYNC |
                                   SI_CPDMA_SKIP_SYNC_BEFORE,
                                   SI_COHERENCY_NONE, L2_BYPASS);
         }
      }
      first_dispatch = false;

      assert(cs->current.cdw <= cs->current.max_dw);
      assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw);
   }
}