src/gallium/drivers/radeonsi/si_compute_prim_discard.c

   1 /*
   2  * Copyright 2019 Advanced Micro Devices, Inc.
   3  * All Rights Reserved.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * on the rights to use, copy, modify, merge, publish, distribute, sub
   9  * license, and/or sell copies of the Software, and to permit persons to whom
  10  * the Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  *
  24  */
  25
  26 #include "si_pipe.h"
  27 #include "si_shader_internal.h"
  28 #include "sid.h"
  29 #include "si_build_pm4.h"
  30 #include "ac_llvm_cull.h"
  31
  32 #include "util/u_prim.h"
  33 #include "util/u_suballoc.h"
  34 #include "util/u_upload_mgr.h"
  35 #include "util/fast_idiv_by_const.h"
  36
  37 /* Based on:
  38  * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf
  39  */
  40
  41 /* This file implements primitive culling using asynchronous compute.
  42  * It's written to be GL conformant.
  43  *
  44  * It takes a monolithic VS in LLVM IR returning gl_Position and invokes it
  45  * in a compute shader. The shader processes 1 primitive/thread by invoking
  46  * the VS for each vertex to get the positions, decomposes strips and fans
  47  * into triangles (if needed), eliminates primitive restart (if needed),
  48  * does (W<0) culling, face culling, view XY culling, zero-area and
  49  * small-primitive culling, and generates a new index buffer that doesn't
  50  * contain culled primitives.
  51  *
  52  * The index buffer is generated using the Ordered Count feature of GDS,
  53  * which is an atomic counter that is incremented in the wavefront launch
  54  * order, so that the original primitive order is preserved.
  55  *
  56  * Another GDS ordered counter is used to eliminate primitive restart indices.
  57  * If a restart index lands on an even thread ID, the compute shader has to flip
  58  * the primitive orientation of the whole following triangle strip. The primitive
  59  * orientation has to be correct after strip and fan decomposition for two-sided
  60  * shading to behave correctly. The decomposition also needs to be aware of
  61  * which vertex is the provoking vertex for flat shading to behave correctly.
  62  *
  63  * IB = a GPU command buffer
  64  *
  65  * Both the compute and gfx IBs run in parallel sort of like CE and DE.
  66  * The gfx IB has a CP barrier (REWIND packet) before a draw packet. REWIND
  67  * doesn't continue if its word isn't 0x80000000. Once compute shaders are
  68  * finished culling, the last wave will write the final primitive count from
  69  * GDS directly into the count word of the draw packet in the gfx IB, and
  70  * a CS_DONE event will signal the REWIND packet to continue. It's really
  71  * a direct draw with command buffer patching from the compute queue.
  72  *
  73  * The compute IB doesn't have to start when its corresponding gfx IB starts,
  74  * but can start sooner. The compute IB is signaled to start after the last
  75  * execution barrier in the *previous* gfx IB. This is handled as follows.
  76  * The kernel GPU scheduler starts the compute IB after the previous gfx IB has
  77  * started. The compute IB then waits (WAIT_REG_MEM) for a mid-IB fence that
  78  * represents the barrier in the previous gfx IB.
  79  *
  80  * Features:
  81  * - Triangle strips and fans are decomposed into an indexed triangle list.
  82  *   The decomposition differs based on the provoking vertex state.
  83  * - Instanced draws are converted into non-instanced draws for 16-bit indices.
  84  *   (InstanceID is stored in the high bits of VertexID and unpacked by VS)
  85  * - Primitive restart is fully supported with triangle strips, including
  86  *   correct primitive orientation across multiple waves. (restart indices
  87  *   reset primitive orientation)
  88  * - W<0 culling (W<0 is behind the viewer, sort of like near Z culling).
  89  * - Back face culling, incl. culling zero-area / degenerate primitives.
  90  * - View XY culling.
  91  * - View Z culling (disabled due to limited impact with perspective projection).
  92  * - Small primitive culling for all MSAA modes and all quant modes.
  93  *
  94  * The following are not implemented:
  95  * - ClipVertex/ClipDistance/CullDistance-based culling.
  96  * - Scissor culling.
  97  * - HiZ culling.
  98  *
  99  * Limitations (and unimplemented features that may be possible to implement):
 100  * - Only triangles, triangle strips, and triangle fans are supported.
 101  * - Primitive restart is only supported with triangle strips.
 102  * - Instancing and primitive restart can't be used together.
 103  * - Instancing is only supported with 16-bit indices and instance count <= 2^16.
 104  * - The instance divisor buffer is unavailable, so all divisors must be
 105  *   either 0 or 1.
 106  * - Multidraws where the vertex shader reads gl_DrawID are unsupported.
 107  * - No support for tessellation and geometry shaders.
 108  *   (patch elimination where tess factors are 0 would be possible to implement)
 109  * - The vertex shader must not contain memory stores.
 110  * - All VS resources must not have a write usage in the command buffer.
 111  * - Bindless textures and images must not occur in the vertex shader.
 112  *
 113  * User data SGPR layout:
 114  *   INDEX_BUFFERS: pointer to constants
 115  *     0..3: input index buffer - typed buffer view
 116  *     4..7: output index buffer - typed buffer view
 117  *     8..11: viewport state - scale.xy, translate.xy
 118  *   VERTEX_COUNTER: counter address or first primitive ID
 119  *     - If unordered memory counter: address of "count" in the draw packet
 120  *       and is incremented atomically by the shader.
 121  *     - If unordered GDS counter: address of "count" in GDS starting from 0,
 122  *       must be initialized to 0 before the dispatch.
 123  *     - If ordered GDS counter: the primitive ID that should reset the vertex
 124  *       counter to 0 in GDS
 125  *   LAST_WAVE_PRIM_ID: the primitive ID that should write the final vertex
 126  *       count to memory if using GDS ordered append
 127  *   VERTEX_COUNT_ADDR: where the last wave should write the vertex count if
 128  *       using GDS ordered append
 129  *   VS.VERTEX_BUFFERS:           same value as VS
 130  *   VS.CONST_AND_SHADER_BUFFERS: same value as VS
 131  *   VS.SAMPLERS_AND_IMAGES:      same value as VS
 132  *   VS.BASE_VERTEX:              same value as VS
 133  *   VS.START_INSTANCE:           same value as VS
 134  *   NUM_PRIMS_UDIV_MULTIPLIER: For fast 31-bit division by the number of primitives
 135  *       per instance for instancing.
 136  *   NUM_PRIMS_UDIV_TERMS:
 137  *     - Bits [0:4]: "post_shift" for fast 31-bit division for instancing.
 138  *     - Bits [5:31]: The number of primitives per instance for computing the remainder.
 139  *   PRIMITIVE_RESTART_INDEX
 140  *   SMALL_PRIM_CULLING_PRECISION: Scale the primitive bounding box by this number.
 141  *
 142  *
 143  * The code contains 3 codepaths:
 144  * - Unordered memory counter (for debugging, random primitive order, no primitive restart)
 145  * - Unordered GDS counter (for debugging, random primitive order, no primitive restart)
 146  * - Ordered GDS counter (it preserves the primitive order)
 147  *
 148  * How to test primitive restart (the most complicated part because it needs
 149  * to get the primitive orientation right):
 150  *   Set THREADGROUP_SIZE to 2 to exercise both intra-wave and inter-wave
 151  *   primitive orientation flips with small draw calls, which is what most tests use.
 152  *   You can also enable draw call splitting into draw calls with just 2 primitives.
 153  */
 154
/* At least 256 is needed for the fastest wave launch rate from compute queues
 * due to hw constraints. Nothing in the code needs more than 1 wave/threadgroup. */
#define THREADGROUP_SIZE                256 /* high numbers limit available VGPRs */
#define THREADGROUPS_PER_CU             1 /* TGs to launch on 1 CU before going onto the next, max 8 */
#define MAX_WAVES_PER_SH                0 /* no limit */
#define INDEX_STORES_USE_SLC            1 /* don't cache indices if L2 is full */
/* Don't cull Z. We already do (W < 0) culling for primitives behind the viewer. */
#define CULL_Z                          0
/* 0 = unordered memory counter, 1 = unordered GDS counter, 2 = ordered GDS counter */
#define VERTEX_COUNTER_GDS_MODE         2
#define GDS_SIZE_UNORDERED              (4 * 1024) /* only for the unordered GDS counter */

/* Grouping compute dispatches for small draw calls: How many primitives from multiple
 * draw calls to process by compute before signaling the gfx IB. This reduces the number
 * of EOP events + REWIND packets, because they decrease performance. */
#define PRIMS_PER_BATCH                 (512 * 1024)
/* Draw call splitting at the packet level. This allows signaling the gfx IB
 * for big draw calls sooner, but doesn't allow context flushes between packets.
 * Primitive restart is supported. Only implemented for ordered append. */
#define SPLIT_PRIMS_PACKET_LEVEL_VALUE  PRIMS_PER_BATCH
/* If there is not enough ring buffer space for the current IB, split draw calls into
 * this number of primitives, so that we can flush the context and get free ring space. */
#define SPLIT_PRIMS_DRAW_LEVEL          PRIMS_PER_BATCH

/* Derived values. */
/* Threadgroup size expressed in units of 64 threads, rounded up. */
#define WAVES_PER_TG                    DIV_ROUND_UP(THREADGROUP_SIZE, 64)
/* The effective packet-level split size. Only the ordered GDS counter mode (2)
 * uses SPLIT_PRIMS_PACKET_LEVEL_VALUE; the other modes get UINT_MAX with the
 * low bits masked off, which is a multiple of THREADGROUP_SIZE as long as
 * THREADGROUP_SIZE is a power of two. */
#define SPLIT_PRIMS_PACKET_LEVEL        (VERTEX_COUNTER_GDS_MODE == 2 ? \
                                         SPLIT_PRIMS_PACKET_LEVEL_VALUE : \
                                         UINT_MAX & ~(THREADGROUP_SIZE - 1))

/* The word value that lets a REWIND packet continue (the file overview states
 * that REWIND doesn't continue if its word isn't 0x80000000). */
#define REWIND_SIGNAL_BIT               0x80000000
/* For emulating the rewind packet on CI. */
#define FORCE_REWIND_EMULATION          0
 188
 189 void si_initialize_prim_discard_tunables(struct si_screen *sscreen,
 190                                          bool is_aux_context,
 191                                          unsigned *prim_discard_vertex_count_threshold,
 192                                          unsigned *index_ring_size_per_ib)
 193 {
 194         *prim_discard_vertex_count_threshold = UINT_MAX; /* disable */
 195
 196         if (sscreen->info.chip_class == GFX6 || /* SI support is not implemented */
 197             !sscreen->info.has_gds_ordered_append ||
 198             sscreen->debug_flags & DBG(NO_PD) ||
 199             is_aux_context)
 200                 return;
 201
 202         /* TODO: enable this after the GDS kernel memory management is fixed */
 203         bool enable_on_pro_graphics_by_default = false;
 204
 205         if (sscreen->debug_flags & DBG(ALWAYS_PD) ||
 206             sscreen->debug_flags & DBG(PD) ||
 207             (enable_on_pro_graphics_by_default &&
 208              sscreen->info.is_pro_graphics &&
 209              (sscreen->info.family == CHIP_BONAIRE ||
 210               sscreen->info.family == CHIP_HAWAII ||
 211               sscreen->info.family == CHIP_TONGA ||
 212               sscreen->info.family == CHIP_FIJI ||
 213               sscreen->info.family == CHIP_POLARIS10 ||
 214               sscreen->info.family == CHIP_POLARIS11 ||
 215               sscreen->info.family == CHIP_VEGA10 ||
 216               sscreen->info.family == CHIP_VEGA20))) {
 217                 *prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */
 218
 219                 if (sscreen->debug_flags & DBG(ALWAYS_PD))
 220                         *prim_discard_vertex_count_threshold = 0; /* always enable */
 221
 222                 const uint32_t MB = 1024 * 1024;
 223                 const uint64_t GB = 1024 * 1024 * 1024;
 224
 225                 /* The total size is double this per context.
 226                  * Greater numbers allow bigger gfx IBs.
 227                  */
 228                 if (sscreen->info.vram_size <= 2 * GB)
 229                         *index_ring_size_per_ib = 64 * MB;
 230                 else if (sscreen->info.vram_size <= 4 * GB)
 231                         *index_ring_size_per_ib = 128 * MB;
 232                 else
 233                         *index_ring_size_per_ib = 256 * MB;
 234         }
 235 }
 236
 237 /* Opcode can be "add" or "swap". */
 238 static LLVMValueRef
 239 si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode,
 240                        LLVMValueRef m0, LLVMValueRef value, unsigned ordered_count_index,
 241                        bool release, bool done)
 242 {
 243         if (ctx->screen->info.chip_class >= GFX10)
 244                 ordered_count_index |= 1 << 24; /* number of dwords == 1 */
 245
 246         LLVMValueRef args[] = {
 247                 LLVMBuildIntToPtr(ctx->ac.builder, m0,
 248                                   LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), ""),
 249                 value,
 250                 LLVMConstInt(ctx->ac.i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */
 251                 ctx->ac.i32_0, /* scope */
 252                 ctx->ac.i1false, /* volatile */
 253                 LLVMConstInt(ctx->ac.i32, ordered_count_index, 0),
 254                 LLVMConstInt(ctx->ac.i1, release, 0),
 255                 LLVMConstInt(ctx->ac.i1, done, 0),
 256         };
 257
 258         char intrinsic[64];
 259         snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode);
 260         return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->ac.i32, args, ARRAY_SIZE(args), 0);
 261 }
 262
 263 static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr)
 264 {
 265         uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32;
 266         ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->ac.i64, "");
 267         ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->ac.i64, hi, 0), "");
 268         return LLVMBuildIntToPtr(ctx->ac.builder, ptr,
 269                                  LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GLOBAL), "");
 270 }
 271
/* State shared between si_enter_thread0_section and si_exit_thread0_section
 * for one section of code that executes only on thread 0.
 */
struct si_thread0_section {
        /* Shader context; set on enter and read back on exit. */
        struct si_shader_context *ctx;
        /* An i32 alloca ("result0") created on enter. On exit, the result is
         * stored to it inside the section and loaded from it afterwards.
         * (a VGPR for the value on thread 0) */
        LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */
        /* NOTE(review): not written by the enter/exit helpers next to this
         * struct; confirm whether any other code still uses it. */
        LLVMValueRef saved_exec;
};
 277
 278 /* Enter a section that only executes on thread 0. */
 279 static void si_enter_thread0_section(struct si_shader_context *ctx,
 280                                      struct si_thread0_section *section,
 281                                      LLVMValueRef thread_id)
 282 {
 283         section->ctx = ctx;
 284         section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "result0");
 285
 286         /* This IF has 4 instructions:
 287          *   v_and_b32_e32 v, 63, v         ; get the thread ID
 288          *   v_cmp_eq_u32_e32 vcc, 0, v     ; thread ID == 0
 289          *   s_and_saveexec_b64 s, vcc
 290          *   s_cbranch_execz BB0_4
 291          *
 292          * It could just be s_and_saveexec_b64 s, 1.
 293          */
 294         ac_build_ifcc(&ctx->ac,
 295                       LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id,
 296                                     ctx->ac.i32_0, ""), 12601);
 297 }
 298
 299 /* Exit a section that only executes on thread 0 and broadcast the result
 300  * to all threads. */
 301 static void si_exit_thread0_section(struct si_thread0_section *section,
 302                                     LLVMValueRef *result)
 303 {
 304         struct si_shader_context *ctx = section->ctx;
 305
 306         LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result);
 307
 308         ac_build_endif(&ctx->ac, 12601);
 309
 310         /* Broadcast the result from thread 0 to all threads. */
 311         *result = ac_build_readlane(&ctx->ac,
 312                         LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL);
 313 }
 314
 315 void si_build_prim_discard_compute_shader(struct si_shader_context *ctx)
 316 {
 317         struct si_shader_key *key = &ctx->shader->key;
 318         LLVMBuilderRef builder = ctx->ac.builder;
 319         LLVMValueRef vs = ctx->main_fn;
 320
 321         /* Always inline the VS function. */
 322         ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE);
 323         LLVMSetLinkage(vs, LLVMPrivateLinkage);
 324
 325         enum ac_arg_type const_desc_type;
 326         if (ctx->shader->selector->info.const_buffers_declared == 1 &&
 327             ctx->shader->selector->info.shader_buffers_declared == 0)
 328                 const_desc_type = AC_ARG_CONST_FLOAT_PTR;
 329         else
 330                 const_desc_type = AC_ARG_CONST_DESC_PTR;
 331
 332         memset(&ctx->args, 0, sizeof(ctx->args));
 333
 334         struct ac_arg param_index_buffers_and_constants, param_vertex_counter;
 335         struct ac_arg param_vb_desc, param_const_desc;
 336         struct ac_arg param_base_vertex, param_start_instance;
 337         struct ac_arg param_block_id, param_local_id, param_ordered_wave_id;
 338         struct ac_arg param_restart_index, param_smallprim_precision;
 339         struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms;
 340         struct ac_arg param_sampler_desc, param_last_wave_prim_id, param_vertex_count_addr;
 341
 342         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
 343                    &param_index_buffers_and_constants);
 344         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_vertex_counter);
 345         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_last_wave_prim_id);
 346         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_vertex_count_addr);
 347         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
 348                    &param_vb_desc);
 349         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type,
 350                    &param_const_desc);
 351         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
 352                    &param_sampler_desc);
 353         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_base_vertex);
 354         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_start_instance);
 355         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_multiplier);
 356         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_terms);
 357         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_restart_index);
 358         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, &param_smallprim_precision);
 359
 360         /* Block ID and thread ID inputs. */
 361         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_block_id);
 362         if (VERTEX_COUNTER_GDS_MODE == 2)
 363                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_ordered_wave_id);
 364         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &param_local_id);
 365
 366         /* Create the compute shader function. */
 367         unsigned old_type = ctx->type;
 368         ctx->type = PIPE_SHADER_COMPUTE;
 369         si_llvm_create_func(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE);
 370         ctx->type = old_type;
 371
 372         if (VERTEX_COUNTER_GDS_MODE == 2) {
 373                 ac_llvm_add_target_dep_function_attr(ctx->main_fn,
 374                                                      "amdgpu-gds-size", 256);
 375         } else if (VERTEX_COUNTER_GDS_MODE == 1) {
 376                 ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size",
 377                                                      GDS_SIZE_UNORDERED);
 378         }
 379
 380         /* Assemble parameters for VS. */
 381         LLVMValueRef vs_params[16];
 382         unsigned num_vs_params = 0;
 383         unsigned param_vertex_id, param_instance_id;
 384
 385         vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */
 386         vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */
 387         vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc);
 388         vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc);
 389         vs_params[num_vs_params++] = LLVMConstInt(ctx->ac.i32,
 390                                         S_VS_STATE_INDEXED(key->opt.cs_indexed), 0);
 391         vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex);
 392         vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance);
 393         vs_params[num_vs_params++] = ctx->ac.i32_0; /* DrawID */
 394         vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc);
 395
 396         vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */
 397         vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */
 398         vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused (PrimID) */
 399         vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused */
 400
 401         assert(num_vs_params <= ARRAY_SIZE(vs_params));
 402         assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs))));
 403
 404         /* Load descriptors. (load 8 dwords at once) */
 405         LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8];
 406
 407         LLVMValueRef index_buffers_and_constants = ac_get_arg(&ctx->ac, param_index_buffers_and_constants);
 408         tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants,
 409                                    ac_array_in_const32_addr_space(ctx->ac.v8i32), "");
 410         tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->ac.i32_0);
 411
 412         for (unsigned i = 0; i < 8; i++)
 413                 desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i);
 414
 415         input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4);
 416         output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4);
 417
 418         /* Compute PrimID and InstanceID. */
 419         LLVMValueRef global_thread_id =
 420                 ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id),
 421                               LLVMConstInt(ctx->ac.i32, THREADGROUP_SIZE, 0),
 422                               ac_get_arg(&ctx->ac, param_local_id));
 423         LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */
 424         LLVMValueRef instance_id = ctx->ac.i32_0;
 425
 426         if (key->opt.cs_instancing) {
 427                 LLVMValueRef num_prims_udiv_terms =
 428                         ac_get_arg(&ctx->ac, param_num_prims_udiv_terms);
 429                 LLVMValueRef num_prims_udiv_multiplier =
 430                         ac_get_arg(&ctx->ac, param_num_prims_udiv_multiplier);
 431                 /* Unpack num_prims_udiv_terms. */
 432                 LLVMValueRef post_shift = LLVMBuildAnd(builder, num_prims_udiv_terms,
 433                                                        LLVMConstInt(ctx->ac.i32, 0x1f, 0), "");
 434                 LLVMValueRef prims_per_instance = LLVMBuildLShr(builder, num_prims_udiv_terms,
 435                                                                 LLVMConstInt(ctx->ac.i32, 5, 0), "");
 436                 /* Divide the total prim_id by the number of prims per instance. */
 437                 instance_id = ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id,
 438                                                                num_prims_udiv_multiplier,
 439                                                                post_shift);
 440                 /* Compute the remainder. */
 441                 prim_id = LLVMBuildSub(builder, prim_id,
 442                                        LLVMBuildMul(builder, instance_id,
 443                                                     prims_per_instance, ""), "");
 444         }
 445
 446         /* Generate indices (like a non-indexed draw call). */
 447         LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->ac.i32)};
 448         unsigned vertices_per_prim = 3;
 449
 450         switch (key->opt.cs_prim_type) {
 451         case PIPE_PRIM_TRIANGLES:
 452                 for (unsigned i = 0; i < 3; i++) {
 453                         index[i] = ac_build_imad(&ctx->ac, prim_id,
 454                                                  LLVMConstInt(ctx->ac.i32, 3, 0),
 455                                                  LLVMConstInt(ctx->ac.i32, i, 0));
 456                 }
 457                 break;
 458         case PIPE_PRIM_TRIANGLE_STRIP:
 459                 for (unsigned i = 0; i < 3; i++) {
 460                         index[i] = LLVMBuildAdd(builder, prim_id,
 461                                                 LLVMConstInt(ctx->ac.i32, i, 0), "");
 462                 }
 463                 break;
 464         case PIPE_PRIM_TRIANGLE_FAN:
 465                 /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper
 466                  * and rasterizer as a normal triangle, so we need to put the provoking
 467                  * vertex into the correct index variable and preserve orientation at the same time.
 468                  * gl_VertexID is preserved, because it's equal to the index.
 469                  */
 470                 if (key->opt.cs_provoking_vertex_first) {
 471                         index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), "");
 472                         index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), "");
 473                         index[2] = ctx->ac.i32_0;
 474                 } else {
 475                         index[0] = ctx->ac.i32_0;
 476                         index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), "");
 477                         index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), "");
 478                 }
 479                 break;
 480         default:
 481                 unreachable("unexpected primitive type");
 482         }
 483
 484         /* Fetch indices. */
 485         if (key->opt.cs_indexed) {
 486                 for (unsigned i = 0; i < 3; i++) {
 487                         index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf,
 488                                                                index[i], ctx->ac.i32_0, 1,
 489                                                                0, true);
 490                         index[i] = ac_to_integer(&ctx->ac, index[i]);
 491                 }
 492         }
 493
 494         LLVMValueRef ordered_wave_id = NULL;
 495
 496         /* Extract the ordered wave ID. */
 497         if (VERTEX_COUNTER_GDS_MODE == 2) {
 498                 ordered_wave_id = ac_get_arg(&ctx->ac, param_ordered_wave_id);
 499                 ordered_wave_id = LLVMBuildLShr(builder, ordered_wave_id,
 500                                                 LLVMConstInt(ctx->ac.i32, 6, 0), "");
 501                 ordered_wave_id = LLVMBuildAnd(builder, ordered_wave_id,
 502                                                LLVMConstInt(ctx->ac.i32, 0xfff, 0), "");
 503         }
 504         LLVMValueRef thread_id =
 505                 LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id),
 506                              LLVMConstInt(ctx->ac.i32, 63, 0), "");
 507
 508         /* Every other triangle in a strip has a reversed vertex order, so we
 509          * need to swap vertices of odd primitives to get the correct primitive
 510          * orientation when converting triangle strips to triangles. Primitive
 511          * restart complicates it, because a strip can start anywhere.
 512          */
 513         LLVMValueRef prim_restart_accepted = ctx->ac.i1true;
 514         LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter);
 515
 516         if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) {
 517                 /* Without primitive restart, odd primitives have reversed orientation.
 518                  * Only primitive restart can flip it with respect to the first vertex
 519                  * of the draw call.
 520                  */
 521                 LLVMValueRef first_is_odd = ctx->ac.i1false;
 522
 523                 /* Handle primitive restart. */
 524                 if (key->opt.cs_primitive_restart) {
 525                         /* Get the GDS primitive restart continue flag and clear
 526                          * the flag in vertex_counter. This flag is used when the draw
 527                          * call was split and we need to load the primitive orientation
 528                          * flag from GDS for the first wave too.
 529                          */
 530                         LLVMValueRef gds_prim_restart_continue =
 531                                 LLVMBuildLShr(builder, vertex_counter,
 532                                               LLVMConstInt(ctx->ac.i32, 31, 0), "");
 533                         gds_prim_restart_continue =
 534                                 LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->ac.i1, "");
 535                         vertex_counter = LLVMBuildAnd(builder, vertex_counter,
 536                                                       LLVMConstInt(ctx->ac.i32, 0x7fffffff, 0), "");
 537
 538                         LLVMValueRef index0_is_reset;
 539
 540                         for (unsigned i = 0; i < 3; i++) {
 541                                 LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i],
 542                                                                        ac_get_arg(&ctx->ac, param_restart_index),
 543                                                                        "");
 544                                 if (i == 0)
 545                                         index0_is_reset = LLVMBuildNot(builder, not_reset, "");
 546                                 prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted,
 547                                                                      not_reset, "");
 548                         }
 549
 550                         /* If the previous waves flip the primitive orientation
 551                          * of the current triangle strip, it will be stored in GDS.
 552                          *
 553                          * Sometimes the correct orientation is not needed, in which case
 554                          * we don't need to execute this.
 555                          */
 556                         if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) {
 557                                 /* If there are reset indices in this wave, get the thread index
 558                                  * where the most recent strip starts relative to each thread.
 559                                  */
 560                                 LLVMValueRef preceding_threads_mask =
 561                                         LLVMBuildSub(builder,
 562                                                      LLVMBuildShl(builder, ctx->ac.i64_1,
 563                                                                   LLVMBuildZExt(builder, thread_id, ctx->ac.i64, ""), ""),
 564                                                      ctx->ac.i64_1, "");
 565
 566                                 LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset);
 567                                 LLVMValueRef preceding_reset_threadmask =
 568                                         LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, "");
 569                                 LLVMValueRef strip_start =
 570                                         ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL);
 571                                 strip_start = LLVMBuildAdd(builder, strip_start, ctx->ac.i32_1, "");
 572
 573                                 /* This flips the orientation based on reset indices within this wave only. */
 574                                 first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->ac.i1, "");
 575
 576                                 LLVMValueRef last_strip_start, prev_wave_state, ret, tmp;
 577                                 LLVMValueRef is_first_wave, current_wave_resets_index;
 578
 579                                 /* Get the thread index where the last strip starts in this wave.
 580                                  *
 581                                  * If the last strip doesn't start in this wave, the thread index
 582                                  * will be 0.
 583                                  *
 584                                  * If the last strip starts in the next wave, the thread index will
 585                                  * be 64.
 586                                  */
 587                                 last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL);
 588                                 last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->ac.i32_1, "");
 589
 590                                 struct si_thread0_section section;
 591                                 si_enter_thread0_section(ctx, &section, thread_id);
 592
 593                                 /* This must be done in the thread 0 section, because
 594                                  * we expect PrimID to be 0 for the whole first wave
 595                                  * in this expression.
 596                                  *
 597                                  * NOTE: This will need to be different if we wanna support
 598                                  * instancing with primitive restart.
 599                                  */
 600                                 is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->ac.i32_0, "");
 601                                 is_first_wave = LLVMBuildAnd(builder, is_first_wave,
 602                                                              LLVMBuildNot(builder,
 603                                                                           gds_prim_restart_continue, ""), "");
 604                                 current_wave_resets_index = LLVMBuildICmp(builder, LLVMIntNE,
 605                                                                           last_strip_start, ctx->ac.i32_0, "");
 606
 607                                 ret = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "prev_state");
 608
 609                                 /* Save the last strip start primitive index in GDS and read
 610                                  * the value that previous waves stored.
 611                                  *
 612                                  * if (is_first_wave || current_wave_resets_strip)
 613                                  *    // Read the value that previous waves stored and store a new one.
 614                                  *    first_is_odd = ds.ordered.swap(last_strip_start);
 615                                  * else
 616                                  *    // Just read the value that previous waves stored.
 617                                  *    first_is_odd = ds.ordered.add(0);
 618                                  */
 619                                 ac_build_ifcc(&ctx->ac,
 620                                               LLVMBuildOr(builder, is_first_wave,
 621                                                           current_wave_resets_index, ""), 12602);
 622                                 {
 623                                         /* The GDS address is always 0 with ordered append. */
 624                                         tmp = si_build_ds_ordered_op(ctx, "swap",
 625                                                                      ordered_wave_id, last_strip_start,
 626                                                                      1, true, false);
 627                                         LLVMBuildStore(builder, tmp, ret);
 628                                 }
 629                                 ac_build_else(&ctx->ac, 12603);
 630                                 {
 631                                         /* Just read the value from GDS. */
 632                                         tmp = si_build_ds_ordered_op(ctx, "add",
 633                                                                      ordered_wave_id, ctx->ac.i32_0,
 634                                                                      1, true, false);
 635                                         LLVMBuildStore(builder, tmp, ret);
 636                                 }
 637                                 ac_build_endif(&ctx->ac, 12602);
 638
 639                                 prev_wave_state = LLVMBuildLoad(builder, ret, "");
 640                                 /* Ignore the return value if this is the first wave. */
 641                                 prev_wave_state = LLVMBuildSelect(builder, is_first_wave,
 642                                                                   ctx->ac.i32_0, prev_wave_state, "");
 643                                 si_exit_thread0_section(&section, &prev_wave_state);
 644                                 prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->ac.i1, "");
 645
 646                                 /* If the strip start appears to be on thread 0 for the current primitive
 647                                  * (meaning the reset index is not present in this wave and might have
 648                                  * appeared in previous waves), use the value from GDS to determine
 649                                  * primitive orientation.
 650                                  *
 651                                  * If the strip start is in this wave for the current primitive, use
 652                                  * the value from the current wave to determine primitive orientation.
 653                                  */
 654                                 LLVMValueRef strip_start_is0 = LLVMBuildICmp(builder, LLVMIntEQ,
 655                                                                              strip_start, ctx->ac.i32_0, "");
 656                                 first_is_odd = LLVMBuildSelect(builder, strip_start_is0, prev_wave_state,
 657                                                                first_is_odd, "");
 658                         }
 659                 }
 660                 /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */
 661                 LLVMValueRef prim_is_odd =
 662                         LLVMBuildXor(builder, first_is_odd,
 663                                      LLVMBuildTrunc(builder, thread_id, ctx->ac.i1, ""), "");
 664
 665                 /* Convert triangle strip indices to triangle indices. */
 666                 ac_build_triangle_strip_indices_to_triangle(&ctx->ac, prim_is_odd,
 667                                                             LLVMConstInt(ctx->ac.i1, key->opt.cs_provoking_vertex_first, 0),
 668                                                             index);
 669         }
 670
 671         /* Execute the vertex shader for each vertex to get vertex positions. */
 672         LLVMValueRef pos[3][4];
 673         for (unsigned i = 0; i < vertices_per_prim; i++) {
 674                 vs_params[param_vertex_id] = index[i];
 675                 vs_params[param_instance_id] = instance_id;
 676
 677                 LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params);
 678                 for (unsigned chan = 0; chan < 4; chan++)
 679                         pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, "");
 680         }
 681
 682         /* Divide XYZ by W. */
 683         for (unsigned i = 0; i < vertices_per_prim; i++) {
 684                 for (unsigned chan = 0; chan < 3; chan++)
 685                         pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]);
 686         }
 687
 688         /* Load the viewport state. */
 689         LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants,
 690                                                   LLVMConstInt(ctx->ac.i32, 2, 0));
 691         vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
 692         LLVMValueRef vp_scale[2], vp_translate[2];
 693         vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
 694         vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
 695         vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
 696         vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);
 697
 698         /* Do culling. */
 699         struct ac_cull_options options = {};
 700         options.cull_front = key->opt.cs_cull_front;
 701         options.cull_back = key->opt.cs_cull_back;
 702         options.cull_view_xy = true;
 703         options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z;
 704         options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z;
 705         options.cull_small_prims = true;
 706         options.cull_zero_area = true;
 707         options.cull_w = true;
 708         options.use_halfz_clip_space = key->opt.cs_halfz_clip_space;
 709
 710         LLVMValueRef accepted =
 711                 ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted,
 712                                  vp_scale, vp_translate,
 713                                  ac_get_arg(&ctx->ac, param_smallprim_precision),
 714                                  &options);
 715
 716         ac_build_optimization_barrier(&ctx->ac, &accepted);
 717         LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted);
 718
 719         /* Count the number of active threads by doing bitcount(accepted). */
 720         LLVMValueRef num_prims_accepted =
 721                 ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i64", ctx->ac.i64,
 722                                    &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE);
 723         num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->ac.i32, "");
 724
 725         LLVMValueRef start;
 726
 727         /* Execute atomic_add on the vertex count. */
 728         struct si_thread0_section section;
 729         si_enter_thread0_section(ctx, &section, thread_id);
 730         {
 731                 if (VERTEX_COUNTER_GDS_MODE == 0) {
 732                         LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
 733                                                 LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
 734                         vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter);
 735                         start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
 736                                                    vertex_counter, num_indices,
 737                                                    LLVMAtomicOrderingMonotonic, false);
 738                 } else if (VERTEX_COUNTER_GDS_MODE == 1) {
 739                         LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
 740                                                 LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
 741                         vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter,
 742                                                            LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), "");
 743                         start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
 744                                                    vertex_counter, num_indices,
 745                                                    LLVMAtomicOrderingMonotonic, false);
 746                 } else if (VERTEX_COUNTER_GDS_MODE == 2) {
 747                         LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
 748
 749                         /* If the draw call was split into multiple subdraws, each using
 750                          * a separate draw packet, we need to start counting from 0 for
 751                          * the first compute wave of the subdraw.
 752                          *
 753                          * vertex_counter contains the primitive ID of the first thread
 754                          * in the first wave.
 755                          *
 756                          * This is only correct with VERTEX_COUNTER_GDS_MODE == 2:
 757                          */
 758                         LLVMValueRef is_first_wave =
 759                                 LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
 760                                               vertex_counter, "");
 761
 762                         /* Store the primitive count for ordered append, not vertex count.
 763                          * The idea is to avoid GDS initialization via CP DMA. The shader
 764                          * effectively stores the first count using "swap".
 765                          *
 766                          * if (first_wave) {
 767                          *    ds.ordered.swap(num_prims_accepted); // store the first primitive count
 768                          *    previous = 0;
 769                          * } else {
 770                          *    previous = ds.ordered.add(num_prims_accepted) // add the primitive count
 771                          * }
 772                          */
 773                         ac_build_ifcc(&ctx->ac, is_first_wave, 12604);
 774                         {
 775                                 /* The GDS address is always 0 with ordered append. */
 776                                 si_build_ds_ordered_op(ctx, "swap", ordered_wave_id,
 777                                                        num_prims_accepted, 0, true, true);
 778                                 LLVMBuildStore(builder, ctx->ac.i32_0, tmp_store);
 779                         }
 780                         ac_build_else(&ctx->ac, 12605);
 781                         {
 782                                 LLVMBuildStore(builder,
 783                                                si_build_ds_ordered_op(ctx, "add", ordered_wave_id,
 784                                                                       num_prims_accepted, 0,
 785                                                                       true, true),
 786                                                tmp_store);
 787                         }
 788                         ac_build_endif(&ctx->ac, 12604);
 789
 790                         start = LLVMBuildLoad(builder, tmp_store, "");
 791                 }
 792         }
 793         si_exit_thread0_section(&section, &start);
 794
 795         /* Write the final vertex count to memory. An EOS/EOP event could do this,
 796          * but those events are super slow and should be avoided if performance
 797          * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE
 798          * event like this.
 799          */
 800         if (VERTEX_COUNTER_GDS_MODE == 2) {
 801                 ac_build_ifcc(&ctx->ac,
 802                               LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
 803                                             ac_get_arg(&ctx->ac, param_last_wave_prim_id), ""),
 804                               12606);
 805                 LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, "");
 806                 count = LLVMBuildMul(builder, count,
 807                                      LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
 808
 809                 /* GFX8 needs to disable caching, so that the CP can see the stored value.
 810                  * MTYPE=3 bypasses TC L2.
 811                  */
 812                 if (ctx->screen->info.chip_class <= GFX8) {
 813                         LLVMValueRef desc[] = {
 814                                 ac_get_arg(&ctx->ac, param_vertex_count_addr),
 815                                 LLVMConstInt(ctx->ac.i32,
 816                                         S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0),
 817                                 LLVMConstInt(ctx->ac.i32, 4, 0),
 818                                 LLVMConstInt(ctx->ac.i32, S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
 819                                                        S_008F0C_MTYPE(3 /* uncached */), 0),
 820                         };
 821                         LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4);
 822                         ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->ac.i32_0,
 823                                                     ctx->ac.i32_0, 0, ac_glc | ac_slc);
 824                 } else {
 825                         LLVMBuildStore(builder, count,
 826                                        si_expand_32bit_pointer(ctx,
 827                                                                ac_get_arg(&ctx->ac,
 828                                                                           param_vertex_count_addr)));
 829                 }
 830                 ac_build_endif(&ctx->ac, 12606);
 831         } else {
 832                 /* For unordered modes that increment a vertex count instead of
 833                  * primitive count, convert it into the primitive index.
 834                  */
 835                 start = LLVMBuildUDiv(builder, start,
 836                                       LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
 837         }
 838
 839         /* Now we need to store the indices of accepted primitives into
 840          * the output index buffer.
 841          */
 842         ac_build_ifcc(&ctx->ac, accepted, 16607);
 843         {
 844                 /* Get the number of bits set before the index of this thread. */
 845                 LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask);
 846
 847                 /* We have lowered instancing. Pack the instance ID into vertex ID. */
 848                 if (key->opt.cs_instancing) {
 849                         instance_id = LLVMBuildShl(builder, instance_id,
 850                                                    LLVMConstInt(ctx->ac.i32, 16, 0), "");
 851
 852                         for (unsigned i = 0; i < vertices_per_prim; i++)
 853                                 index[i] = LLVMBuildOr(builder, index[i], instance_id, "");
 854                 }
 855
 856                 if (VERTEX_COUNTER_GDS_MODE == 2) {
 857                         /* vertex_counter contains the first primitive ID
 858                          * for this dispatch. If the draw call was split into
 859                          * multiple subdraws, the first primitive ID is > 0
 860                          * for subsequent subdraws. Each subdraw uses a different
 861                          * portion of the output index buffer. Offset the store
 862                          * vindex by the first primitive ID to get the correct
 863                          * store address for the subdraw.
 864                          */
 865                         start = LLVMBuildAdd(builder, start, vertex_counter, "");
 866                 }
 867
 868                 /* Write indices for accepted primitives. */
 869                 LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, "");
 870                 LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3);
 871
 872                 if (!ac_has_vec3_support(ctx->ac.chip_class, true))
 873                         vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3);
 874
 875                 ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata,
 876                                              vindex, ctx->ac.i32_0, 3,
 877                                              ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0));
 878         }
 879         ac_build_endif(&ctx->ac, 16607);
 880
 881         LLVMBuildRetVoid(builder);
 882 }
 883
 884 /* Build the shader key for the primitive discard compute shader variant of
      * the bound vertex shader and select that variant into
      * sctx->cs_prim_discard_state.
      *
      * The key is filled from the draw info, the queued rasterizer state and,
      * with primitive restart, the bound pixel shader.
      *
      * Return false if the shader isn't ready, or if the selected variant
      * uses the scratch buffer.
      */
 885 static bool si_shader_select_prim_discard_cs(struct si_context *sctx,
 886                                              const struct pipe_draw_info *info,
 887                                              bool primitive_restart)
 888 {
 889         struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 890         struct si_shader_key key;
 891
 892         /* Primitive restart needs ordered counters. */
 893         assert(!primitive_restart || VERTEX_COUNTER_GDS_MODE == 2);
             /* Primitive restart is only expected together with non-instanced draws. */
 894         assert(!primitive_restart || info->instance_count == 1);
 895
             /* Start from the regular VS key (including the prolog key), then
              * override the fields that are specific to the compute variant.
              */
 896         memset(&key, 0, sizeof(key));
 897         si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, &key, &key.part.vs.prolog);
 898         assert(!key.part.vs.prolog.instance_divisor_is_fetched);
 899
             /* NOTE(review): presumably cleared because the compute shader passes
              * the instance ID to the VS as a separate parameter - confirm.
              */
 900         key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0;
 901         key.opt.vs_as_prim_discard_cs = 1;
 902         key.opt.cs_prim_type = info->mode;
 903         key.opt.cs_indexed = info->index_size != 0;
 904         key.opt.cs_instancing = info->instance_count > 1;
 905         key.opt.cs_primitive_restart = primitive_restart;
 906         key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first;
 907
 908         /* Primitive restart with triangle strips needs to preserve primitive
 909          * orientation for cases where front and back primitive orientation matters.
 910          */
 911         if (primitive_restart) {
 912                 struct si_shader_selector *ps = sctx->ps_shader.cso;
 913
                     /* Orientation matters if exactly one face is culled, if the
                      * pixel shader uses the front-face input, or if two-sided
                      * rendering is enabled and the pixel shader reads colors.
                      */
 914                 key.opt.cs_need_correct_orientation =
 915                         rs->cull_front != rs->cull_back ||
 916                         ps->info.uses_frontface ||
 917                         (rs->two_side && ps->info.colors_read);
 918         }
 919
 920         if (rs->rasterizer_discard) {
 921                 /* Just for performance testing and analysis of trivial bottlenecks.
 922                  * This should result in a very short compute shader. */
 923                 key.opt.cs_cull_front = 1;
 924                 key.opt.cs_cull_back = 1;
 925         } else {
                     /* With an inverted Y viewport, the front and back cull flags
                      * of the rasterizer state are swapped.
                      */
 926                 key.opt.cs_cull_front =
 927                         sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front;
 928                 key.opt.cs_cull_back =
 929                         sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back;
 930         }
 931
             /* Z culling is only requested when no depth clamping is enabled and
              * CULL_Z is set at build time.
              */
 932         if (!rs->depth_clamp_any && CULL_Z) {
 933                 key.opt.cs_cull_z = 1;
 934                 key.opt.cs_halfz_clip_space = rs->clip_halfz;
 935         }
 936
             /* Point the prim discard shader state at the current VS selector and
              * clear the previously selected variant before the lookup below.
              */
 937         sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso;
 938         sctx->cs_prim_discard_state.current = NULL;
 939
             /* Initialize the context's compiler on first use. */
 940         if (!sctx->compiler.passes)
 941                 si_init_compiler(sctx->screen, &sctx->compiler);
 942
 943         struct si_compiler_ctx_state compiler_state;
 944         compiler_state.compiler = &sctx->compiler;
 945         compiler_state.debug = sctx->debug;
 946         compiler_state.is_debug_context = sctx->is_debug;
 947
             /* The scratch check only runs when the selection returned 0, because
              * of the short-circuit evaluation of &&.
              */
 948         return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state,
 949                                          &compiler_state, &key, -1, true) == 0 &&
 950                /* Disallow compute shaders using the scratch buffer. */
 951                sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0;
 952 }
 953
 954 static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx)
 955 {
             /* Create, on first use, the resources needed by the primitive discard
              * compute path: the GDS and GDS OA buffers (depending on
              * VERTEX_COUNTER_GDS_MODE), the parallel compute IB and the index
              * ring. Returns false if any of them can't be created.
              *
              * An existing index ring means everything has already been set up.
              */
 956         if (sctx->index_ring)
 957                 return true;
 958
 959         if (!sctx->prim_discard_compute_cs) {
 960                 struct radeon_winsys *ws = sctx->ws;
                     /* Mode 1 uses GDS_SIZE_UNORDERED, mode 2 uses 8, and mode 0
                      * doesn't use GDS at all. Only mode 2 asks for OA counters.
                      */
 961                 unsigned gds_size = VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED :
 962                                     VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0;
 963                 unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 2 : 0;
 964
                     /* NOTE(review): if a later step in this branch fails, sctx->gds
                      * (and possibly sctx->gds_oa) stay set while
                      * prim_discard_compute_cs is still NULL, so a retry re-enters
                      * this branch and overwrites them - confirm that this can't
                      * leak the previously created buffers.
                      */
 965                 if (gds_size) {
 966                         sctx->gds = ws->buffer_create(ws, gds_size, 4,
 967                                                       RADEON_DOMAIN_GDS, 0);
 968                         if (!sctx->gds)
 969                                 return false;
 970
 971                         ws->cs_add_buffer(sctx->gfx_cs, sctx->gds,
 972                                           RADEON_USAGE_READWRITE, 0, 0);
 973                 }
 974                 if (num_oa_counters) {
 975                         assert(gds_size);
 976                         sctx->gds_oa = ws->buffer_create(ws, num_oa_counters,
 977                                                          1, RADEON_DOMAIN_OA, 0);
 978                         if (!sctx->gds_oa)
 979                                 return false;
 980
 981                         ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa,
 982                                           RADEON_USAGE_READWRITE, 0, 0);
 983                 }
 984
                     /* The second argument is true only when OA counters are used. */
 985                 sctx->prim_discard_compute_cs =
 986                         ws->cs_add_parallel_compute_ib(sctx->gfx_cs,
 987                                                        num_oa_counters > 0);
 988                 if (!sctx->prim_discard_compute_cs)
 989                         return false;
 990         }
 991
             /* This condition is always true here because of the early return at
              * the top. The ring is created with twice index_ring_size_per_ib and
              * aligned to the PTE fragment size.
              */
 992         if (!sctx->index_ring) {
 993                 sctx->index_ring =
 994                         si_aligned_buffer_create(sctx->b.screen,
 995                                                  SI_RESOURCE_FLAG_UNMAPPABLE,
 996                                                  PIPE_USAGE_DEFAULT,
 997                                                  sctx->index_ring_size_per_ib * 2,
 998                                                  sctx->screen->info.pte_fragment_size);
 999                 if (!sctx->index_ring)
1000                         return false;
1001         }
1002         return true;
1003 }
1004
1005 static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_size)
1006 {
             /* Return true if out_indexbuf_size, rounded up to the TCC cache line
              * size and added to the current index ring offset, does not exceed
              * index_ring_size_per_ib.
              */
1007         return sctx->index_ring_offset +
1008                align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <=
1009                sctx->index_ring_size_per_ib;
1010 }
1011
1012 enum si_prim_discard_outcome
1013 si_prepare_prim_discard_or_split_draw(struct si_context *sctx,
1014                                       const struct pipe_draw_info *info,
1015                                       bool primitive_restart)
1016 {
             /* Decide how the given draw is handled by the primitive discard
              * compute path and make sure that the shader, the ring and the
              * command buffer space needed for it are available.
              *
              * Returns:
              * - SI_PRIM_DISCARD_DISABLED if the compute shader isn't ready, if
              *   the command buffer or ring setup failed, or if the draw doesn't
              *   fit into index_ring_size_per_ib and wasn't split.
              * - SI_PRIM_DISCARD_DRAW_SPLIT if the draw was split into smaller
              *   draws, each of which has already been submitted via draw_vbo.
              * - SI_PRIM_DISCARD_ENABLED otherwise. The gfx IB may have been
              *   flushed on the way to make room.
              */
1017         /* If the compute shader compilation isn't finished, this returns false. */
1018         if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart))
1019                 return SI_PRIM_DISCARD_DISABLED;
1020
1021         if (!si_initialize_prim_discard_cmdbuf(sctx))
1022                 return SI_PRIM_DISCARD_DISABLED;
1023
1024         struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
1025         unsigned prim = info->mode;
1026         unsigned count = info->count;
1027         unsigned instance_count = info->instance_count;
1028         unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count);
1029         unsigned num_prims = num_prims_per_instance * instance_count;
             /* 12 per primitive. NOTE(review): presumably three 32-bit indices
              * per output triangle - confirm.
              */
1030         unsigned out_indexbuf_size = num_prims * 12;
             /* ring_full reflects the ring state before any flush done below. */
1031         bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size);
1032         const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL;
1033
1034         /* Split draws at the draw call level if the ring is full. This makes
1035          * better use of the ring space.
1036          */
1037         if (ring_full &&
1038             num_prims > split_prims_draw_level &&
1039             instance_count == 1 && /* TODO: support splitting instanced draws */
1040             (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) |
1041                            (1 << PIPE_PRIM_TRIANGLE_STRIP))) {
1042                 /* Split draws. */
1043                 struct pipe_draw_info split_draw = *info;
1044                 split_draw.primitive_restart = primitive_restart;
1045
1046                 unsigned base_start = split_draw.start;
1047
1048                 if (prim == PIPE_PRIM_TRIANGLES) {
                             /* Each subdraw gets at most split_prims_draw_level
                              * triangles, i.e. 3 vertices per triangle.
                              */
1049                         unsigned vert_count_per_subdraw = split_prims_draw_level * 3;
1050                         assert(vert_count_per_subdraw < count);
1051
1052                         for (unsigned start = 0; start < count; start += vert_count_per_subdraw) {
1053                                 split_draw.start = base_start + start;
1054                                 split_draw.count = MIN2(count - start, vert_count_per_subdraw);
1055
1056                                 sctx->b.draw_vbo(&sctx->b, &split_draw);
1057                         }
1058                 } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) {
1059                         /* No primitive pair can be split, because strips reverse orientation
1060                          * for odd primitives. */
1061                         STATIC_ASSERT(split_prims_draw_level % 2 == 0);
1062
1063                         unsigned vert_count_per_subdraw = split_prims_draw_level;
1064
                             /* Consecutive subdraws overlap by 2 vertices: the start
                              * advances by vert_count_per_subdraw, while each subdraw
                              * covers up to vert_count_per_subdraw + 2 vertices.
                              */
1065                         for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) {
1066                                 split_draw.start = base_start + start;
1067                                 split_draw.count = MIN2(count - start, vert_count_per_subdraw + 2);
1068
1069                                 sctx->b.draw_vbo(&sctx->b, &split_draw);
1070
                                     /* Set once the first subdraw has been submitted
                                      * and cleared again after the last one.
                                      * NOTE(review): presumably this keeps the strip
                                      * orientation state in GDS intact across flushes
                                      * caused by the remaining subdraws - confirm
                                      * against the flush code.
                                      */
1071                                 if (start == 0 &&
1072                                     primitive_restart &&
1073                                     sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation)
1074                                         sctx->preserve_prim_restart_gds_at_flush = true;
1075                         }
1076                         sctx->preserve_prim_restart_gds_at_flush = false;
1077                 } else {
                             /* Not reachable: the condition above only admits
                              * triangles and triangle strips.
                              */
1078                         assert(0);
1079                 }
1080
1081                 return SI_PRIM_DISCARD_DRAW_SPLIT;
1082         }
1083
1084         /* Just quit if the draw call doesn't fit into the ring and can't be split. */
1085         if (out_indexbuf_size > sctx->index_ring_size_per_ib) {
1086                 if (SI_PRIM_DISCARD_DEBUG)
1087                         puts("PD failed: draw call too big, can't be split");
1088                 return SI_PRIM_DISCARD_DISABLED;
1089         }
1090
             /* Compute how many dwords are needed in the compute IB and in the
              * gfx IB for this draw.
              */
1091         unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL);
1092         unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ +
1093                                    24 * (num_subdraws - 1) + /* subdraws */
1094                                    20; /* leave some space at the end */
1095         unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx);
1096
1097         if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION)
1098                 need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */
1099         else
1100                 need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */
1101
             /* Flush the gfx IB if the ring is full, if the unordered GDS range
              * is used up (mode 1 only), or if the gfx IB lacks the needed space.
              */
1102         if (ring_full ||
1103             (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) ||
1104             !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) {
1105                 /* If the current IB is empty but the size is too small, add a NOP
1106                  * packet to force a flush and get a bigger IB.
1107                  */
1108                 if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) &&
1109                     gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) {
1110                         radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
1111                         radeon_emit(gfx_cs, 0);
1112                 }
1113
1114                 si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
1115         }
1116
1117         /* The compute IB is always chained, but we need to call cs_check_space to add more space. */
1118         struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
1119         ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false);
1120         assert(compute_has_space);
             /* The ring is expected to have room at this point, whether or not a
              * flush happened above.
              */
1121         assert(si_check_ring_space(sctx, out_indexbuf_size));
1122         return SI_PRIM_DISCARD_ENABLED;
1123 }
1124
1125 void si_compute_signal_gfx(struct si_context *sctx)
1126 {
1127         struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
1128         unsigned writeback_L2_flags = 0;
1129
1130         /* The writeback L2 flags vary with each chip generation. */
1131         /* CI needs to flush vertex indices to memory. */
1132         if (sctx->chip_class <= GFX7)
1133                 writeback_L2_flags = EVENT_TC_WB_ACTION_ENA;
1134         else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0)
1135                 writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA;
1136
1137         if (!sctx->compute_num_prims_in_batch)
1138                 return;
1139
1140         assert(sctx->compute_rewind_va);
1141
1142         /* After the queued dispatches are done and vertex counts are written to
1143          * the gfx IB, signal the gfx IB to continue. CP doesn't wait for
1144          * the dispatches to finish, it only adds the CS_DONE event into the event
1145          * queue.
1146          */
1147         si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags,
1148                           sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
1149                           writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM :
1150                                                EOP_INT_SEL_NONE,
1151                           EOP_DATA_SEL_VALUE_32BIT,
1152                           NULL,
1153                           sctx->compute_rewind_va |
1154                           ((uint64_t)sctx->screen->info.address32_hi << 32),
1155                           REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */
1156                           SI_NOT_QUERY);
1157
1158         sctx->compute_rewind_va = 0;
1159         sctx->compute_num_prims_in_batch = 0;
1160 }
1161
1162 /* Dispatch a primitive discard compute shader. */
1163 void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
1164                                           const struct pipe_draw_info *info,
1165                                           unsigned index_size,
1166                                           unsigned base_vertex,
1167                                           uint64_t input_indexbuf_va,
1168                                           unsigned input_indexbuf_num_elements)
1169 {
1170         struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
1171         struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
1172         unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, info->count);
1173         if (!num_prims_per_instance)
1174                 return;
1175
1176         unsigned num_prims = num_prims_per_instance * info->instance_count;
1177         unsigned vertices_per_prim, output_indexbuf_format, gfx10_output_indexbuf_format;
1178
1179         switch (info->mode) {
1180         case PIPE_PRIM_TRIANGLES:
1181         case PIPE_PRIM_TRIANGLE_STRIP:
1182         case PIPE_PRIM_TRIANGLE_FAN:
1183                 vertices_per_prim = 3;
1184                 output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32;
1185                 gfx10_output_indexbuf_format = V_008F0C_IMG_FORMAT_32_32_32_UINT;
1186                 break;
1187         default:
1188                 unreachable("unsupported primitive type");
1189                 return;
1190         }
1191
1192         unsigned out_indexbuf_offset;
1193         uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4;
1194         bool first_dispatch = !sctx->prim_discard_compute_ib_initialized;
1195
1196         /* Initialize the compute IB if it's empty. */
1197         if (!sctx->prim_discard_compute_ib_initialized) {
1198                 /* 1) State initialization. */
1199                 sctx->compute_gds_offset = 0;
1200                 sctx->compute_ib_last_shader = NULL;
1201
1202                 if (sctx->last_ib_barrier_fence) {
1203                         assert(!sctx->last_ib_barrier_buf);
1204                         sctx->ws->cs_add_fence_dependency(gfx_cs,
1205                                                           sctx->last_ib_barrier_fence,
1206                                                           RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY);
1207                 }
1208
1209                 /* 2) IB initialization. */
1210
1211                 /* This needs to be done at the beginning of IBs due to possible
1212                  * TTM buffer moves in the kernel.
1213                  */
1214                 if (sctx->chip_class >= GFX10) {
1215                         radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
1216                         radeon_emit(cs, 0);             /* CP_COHER_CNTL */
1217                         radeon_emit(cs, 0xffffffff);    /* CP_COHER_SIZE */
1218                         radeon_emit(cs, 0xffffff);      /* CP_COHER_SIZE_HI */
1219                         radeon_emit(cs, 0);             /* CP_COHER_BASE */
1220                         radeon_emit(cs, 0);             /* CP_COHER_BASE_HI */
1221                         radeon_emit(cs, 0x0000000A);    /* POLL_INTERVAL */
1222                         radeon_emit(cs,                 /* GCR_CNTL */
1223                                     S_586_GLI_INV(V_586_GLI_ALL) |
1224                                     S_586_GLK_INV(1) | S_586_GLV_INV(1) |
1225                                     S_586_GL1_INV(1) |
1226                                     S_586_GL2_INV(1) | S_586_GL2_WB(1) |
1227                                     S_586_GLM_INV(1) | S_586_GLM_WB(1) |
1228                                     S_586_SEQ(V_586_SEQ_FORWARD));
1229                 } else {
1230                         si_emit_surface_sync(sctx, cs,
1231                                              S_0085F0_TC_ACTION_ENA(1) |
1232                                              S_0085F0_TCL1_ACTION_ENA(1) |
1233                                              S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) |
1234                                              S_0085F0_SH_ICACHE_ACTION_ENA(1) |
1235                                              S_0085F0_SH_KCACHE_ACTION_ENA(1));
1236                 }
1237
1238                 /* Restore the GDS prim restart counter if needed. */
1239                 if (sctx->preserve_prim_restart_gds_at_flush) {
1240                         si_cp_copy_data(sctx, cs,
1241                                         COPY_DATA_GDS, NULL, 4,
1242                                         COPY_DATA_SRC_MEM, sctx->wait_mem_scratch, 4);
1243                 }
1244
1245                 si_emit_initial_compute_regs(sctx, cs);
1246
1247                 radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE,
1248                                   S_00B860_WAVES(sctx->scratch_waves) |
1249                                   S_00B860_WAVESIZE(0)); /* no scratch */
1250
1251                 /* Only 1D grids are launched. */
1252                 radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2);
1253                 radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) |
1254                                 S_00B820_NUM_THREAD_PARTIAL(1));
1255                 radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) |
1256                                 S_00B824_NUM_THREAD_PARTIAL(1));
1257
1258                 radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2);
1259                 radeon_emit(cs, 0);
1260                 radeon_emit(cs, 0);
1261
1262                 /* Disable ordered alloc for OA resources. */
1263                 for (unsigned i = 0; i < 2; i++) {
1264                         radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3);
1265                         radeon_emit(cs, S_031074_INDEX(i));
1266                         radeon_emit(cs, 0);
1267                         radeon_emit(cs, S_03107C_ENABLE(0));
1268                 }
1269
1270                 if (sctx->last_ib_barrier_buf) {
1271                         assert(!sctx->last_ib_barrier_fence);
1272                         radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf,
1273                                                   RADEON_USAGE_READ, RADEON_PRIO_FENCE);
1274                         si_cp_wait_mem(sctx, cs,
1275                                        sctx->last_ib_barrier_buf->gpu_address +
1276                                        sctx->last_ib_barrier_buf_offset, 1, 1,
1277                                        WAIT_REG_MEM_EQUAL);
1278                 }
1279
1280                 sctx->prim_discard_compute_ib_initialized = true;
1281         }
1282
1283         /* Allocate the output index buffer. */
1284         output_indexbuf_size = align(output_indexbuf_size,
1285                                      sctx->screen->info.tcc_cache_line_size);
1286         assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib);
1287         out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset;
1288         sctx->index_ring_offset += output_indexbuf_size;
1289
1290         radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE,
1291                                   RADEON_PRIO_SHADER_RW_BUFFER);
1292         uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset;
1293
1294         /* Prepare index buffer descriptors. */
1295         struct si_resource *indexbuf_desc = NULL;
1296         unsigned indexbuf_desc_offset;
1297         unsigned desc_size = 12 * 4;
1298         uint32_t *desc;
1299
1300         u_upload_alloc(sctx->b.const_uploader, 0, desc_size,
1301                        si_optimal_tcc_alignment(sctx, desc_size),
1302                        &indexbuf_desc_offset, (struct pipe_resource**)&indexbuf_desc,
1303                        (void**)&desc);
1304         radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ,
1305                                   RADEON_PRIO_DESCRIPTORS);
1306
1307         /* Input index buffer. */
1308         desc[0] = input_indexbuf_va;
1309         desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) |
1310                   S_008F04_STRIDE(index_size);
1311         desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? index_size : 1);
1312
1313         if (sctx->chip_class >= GFX10) {
1314                 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1315                           S_008F0C_FORMAT(index_size == 1 ? V_008F0C_IMG_FORMAT_8_UINT :
1316                                           index_size == 2 ? V_008F0C_IMG_FORMAT_16_UINT :
1317                                                             V_008F0C_IMG_FORMAT_32_UINT) |
1318                           S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
1319                           S_008F0C_RESOURCE_LEVEL(1);
1320         } else {
1321                 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1322                           S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
1323                           S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 :
1324                                                index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16 :
1325                                                                  V_008F0C_BUF_DATA_FORMAT_32);
1326         }
1327
1328         /* Output index buffer. */
1329         desc[4] = out_indexbuf_va;
1330         desc[5] = S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) |
1331                   S_008F04_STRIDE(vertices_per_prim * 4);
1332         desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1);
1333
1334         if (sctx->chip_class >= GFX10) {
1335                 desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1336                           S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1337                           S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
1338                           S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
1339                           S_008F0C_FORMAT(gfx10_output_indexbuf_format) |
1340                           S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
1341                           S_008F0C_RESOURCE_LEVEL(1);
1342         } else {
1343                 desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1344                           S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1345                           S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
1346                           S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
1347                           S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
1348                           S_008F0C_DATA_FORMAT(output_indexbuf_format);
1349         }
1350
1351         /* Viewport state. */
1352         struct si_small_prim_cull_info cull_info;
1353         si_get_small_prim_cull_info(sctx, &cull_info);
1354
1355         desc[8] = fui(cull_info.scale[0]);
1356         desc[9] = fui(cull_info.scale[1]);
1357         desc[10] = fui(cull_info.translate[0]);
1358         desc[11] = fui(cull_info.translate[1]);
1359
1360         /* Better subpixel precision increases the efficiency of small
1361          * primitive culling. */
1362         unsigned num_samples = sctx->framebuffer.nr_samples;
1363         unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode;
1364         float small_prim_cull_precision;
1365
1366         if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
1367                 small_prim_cull_precision = num_samples / 4096.0;
1368         else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
1369                 small_prim_cull_precision = num_samples / 1024.0;
1370         else
1371                 small_prim_cull_precision = num_samples / 256.0;
1372
1373         /* Set user data SGPRs. */
1374         /* This can't be greater than 14 if we want the fastest launch rate. */
1375         unsigned user_sgprs = 13;
1376
1377         uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset;
1378         unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX);
1379         unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX);
1380         uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address;
1381         uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address;
1382         uint64_t vb_desc_va = sctx->vb_descriptors_buffer ?
1383                                       sctx->vb_descriptors_buffer->gpu_address +
1384                                       sctx->vb_descriptors_offset : 0;
1385         unsigned gds_offset, gds_size;
1386         struct si_fast_udiv_info32 num_prims_udiv = {};
1387
1388         if (info->instance_count > 1)
1389                 num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31);
1390
1391         /* Limitations on how these two are packed in the user SGPR. */
1392         assert(num_prims_udiv.post_shift < 32);
1393         assert(num_prims_per_instance < 1 << 27);
1394
1395         si_resource_reference(&indexbuf_desc, NULL);
1396
1397         bool primitive_restart = sctx->cs_prim_discard_state.current->key.opt.cs_primitive_restart;
1398
1399         if (VERTEX_COUNTER_GDS_MODE == 1) {
1400                 gds_offset = sctx->compute_gds_offset;
1401                 gds_size = primitive_restart ? 8 : 4;
1402                 sctx->compute_gds_offset += gds_size;
1403
1404                 /* Reset the counters in GDS for the first dispatch using WRITE_DATA.
1405                  * The remainder of the GDS will be cleared after the dispatch packet
1406                  * in parallel with compute shaders.
1407                  */
1408                 if (first_dispatch) {
1409                         radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size/4, 0));
1410                         radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1));
1411                         radeon_emit(cs, gds_offset);
1412                         radeon_emit(cs, 0);
1413                         radeon_emit(cs, 0); /* value to write */
1414                         if (gds_size == 8)
1415                                 radeon_emit(cs, 0);
1416                 }
1417         }
1418
1419         /* Set shader registers. */
1420         struct si_shader *shader = sctx->cs_prim_discard_state.current;
1421
1422         if (shader != sctx->compute_ib_last_shader) {
1423                 radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ,
1424                                           RADEON_PRIO_SHADER_BINARY);
1425                 uint64_t shader_va = shader->bo->gpu_address;
1426
1427                 assert(shader->config.scratch_bytes_per_wave == 0);
1428                 assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4);
1429
1430                 radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
1431                 radeon_emit(cs, shader_va >> 8);
1432                 radeon_emit(cs, S_00B834_DATA(shader_va >> 40));
1433
1434                 radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
1435                 radeon_emit(cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) |
1436                                 S_00B848_SGPRS(sctx->chip_class <= GFX9 ?
1437                                                (shader->config.num_sgprs - 1) / 8 : 0) |
1438                                 S_00B848_FLOAT_MODE(shader->config.float_mode) |
1439                                 S_00B848_DX10_CLAMP(1) |
1440                                 S_00B848_MEM_ORDERED(sctx->chip_class >= GFX10) |
1441                                 S_00B848_WGP_MODE(sctx->chip_class >= GFX10));
1442                 radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) |
1443                                 S_00B84C_USER_SGPR(user_sgprs) |
1444                                 S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) |
1445                                 S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) |
1446                                 S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) |
1447                                 S_00B84C_LDS_SIZE(shader->config.lds_size));
1448
1449                 radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
1450                         ac_get_compute_resource_limits(&sctx->screen->info,
1451                                                        WAVES_PER_TG,
1452                                                        MAX_WAVES_PER_SH,
1453                                                        THREADGROUPS_PER_CU));
1454                 sctx->compute_ib_last_shader = shader;
1455         }
1456
1457         STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0);
1458
1459         /* Big draw calls are split into smaller dispatches and draw packets. */
1460         for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) {
1461                 unsigned num_subdraw_prims;
1462
1463                 if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims)
1464                         num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL;
1465                 else
1466                         num_subdraw_prims = num_prims - start_prim;
1467
1468                 /* Small dispatches are executed back to back until a specific primitive
1469                  * count is reached. Then, a CS_DONE is inserted to signal the gfx IB
1470                  * to start drawing the batch. This batching adds latency to the gfx IB,
1471                  * but CS_DONE and REWIND are too slow.
1472                  */
1473                 if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH)
1474                         si_compute_signal_gfx(sctx);
1475
1476                 if (sctx->compute_num_prims_in_batch == 0) {
1477                         assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi);
1478                         sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4;
1479
1480                         if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) {
1481                                 radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
1482                                 radeon_emit(gfx_cs, 0);
1483
1484                                 si_cp_wait_mem(sctx, gfx_cs,
1485                                                sctx->compute_rewind_va |
1486                                                (uint64_t)sctx->screen->info.address32_hi << 32,
1487                                                REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT,
1488                                                WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP);
1489
1490                                 /* Use INDIRECT_BUFFER to chain to a different buffer
1491                                  * to discard the CP prefetch cache.
1492                                  */
1493                                 sctx->ws->cs_check_space(gfx_cs, 0, true);
1494                         } else {
1495                                 radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
1496                                 radeon_emit(gfx_cs, 0);
1497                         }
1498                 }
1499
1500                 sctx->compute_num_prims_in_batch += num_subdraw_prims;
1501
1502                 uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4;
1503                 uint64_t index_va = out_indexbuf_va + start_prim * 12;
1504
1505                 /* Emit the draw packet into the gfx IB. */
1506                 radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0));
1507                 radeon_emit(gfx_cs, num_prims * vertices_per_prim);
1508                 radeon_emit(gfx_cs, index_va);
1509                 radeon_emit(gfx_cs, index_va >> 32);
1510                 radeon_emit(gfx_cs, 0);
1511                 radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA);
1512
1513                 /* Continue with the compute IB. */
1514                 if (start_prim == 0) {
1515                         uint32_t gds_prim_restart_continue_bit = 0;
1516
1517                         if (sctx->preserve_prim_restart_gds_at_flush) {
1518                                 assert(primitive_restart &&
1519                                        info->mode == PIPE_PRIM_TRIANGLE_STRIP);
1520                                 assert(start_prim < 1 << 31);
1521                                 gds_prim_restart_continue_bit = 1 << 31;
1522                         }
1523
1524                         radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs);
1525                         radeon_emit(cs, index_buffers_va);
1526                         radeon_emit(cs,
1527                                     VERTEX_COUNTER_GDS_MODE == 0 ? count_va :
1528                                     VERTEX_COUNTER_GDS_MODE == 1 ? gds_offset :
1529                                                                    start_prim |
1530                                                                    gds_prim_restart_continue_bit);
1531                         radeon_emit(cs, start_prim + num_subdraw_prims - 1);
1532                         radeon_emit(cs, count_va);
1533                         radeon_emit(cs, vb_desc_va);
1534                         radeon_emit(cs, vs_const_desc_va);
1535                         radeon_emit(cs, vs_sampler_desc_va);
1536                         radeon_emit(cs, base_vertex);
1537                         radeon_emit(cs, info->start_instance);
1538                         radeon_emit(cs, num_prims_udiv.multiplier);
1539                         radeon_emit(cs, num_prims_udiv.post_shift |
1540                                         (num_prims_per_instance << 5));
1541                         radeon_emit(cs, info->restart_index);
1542                         /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
1543                         radeon_emit(cs, fui(small_prim_cull_precision));
1544                 } else {
1545                         assert(VERTEX_COUNTER_GDS_MODE == 2);
1546                         /* Only update the SGPRs that changed. */
1547                         radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3);
1548                         radeon_emit(cs, start_prim);
1549                         radeon_emit(cs, start_prim + num_subdraw_prims - 1);
1550                         radeon_emit(cs, count_va);
1551                 }
1552
1553                 /* Set grid dimensions. */
1554                 unsigned start_block = start_prim / THREADGROUP_SIZE;
1555                 unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE;
1556                 unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE;
1557
1558                 radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block);
1559                 radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X,
1560                                   S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) |
1561                                   S_00B81C_NUM_THREAD_PARTIAL(partial_block_size));
1562
1563                 radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) |
1564                                 PKT3_SHADER_TYPE_S(1));
1565                 radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size);
1566                 radeon_emit(cs, 1);
1567                 radeon_emit(cs, 1);
1568                 radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) |
1569                                 S_00B800_PARTIAL_TG_EN(!!partial_block_size) |
1570                                 S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) |
1571                                 S_00B800_ORDER_MODE(0 /* launch in order */));
1572
1573                 /* This is only for unordered append. Ordered append writes this from
1574                  * the shader.
1575                  *
1576                  * Note that EOP and EOS events are super slow, so emulating the event
1577                  * in a shader is an important optimization.
1578                  */
1579                 if (VERTEX_COUNTER_GDS_MODE == 1) {
1580                         si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0,
1581                                           sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
1582                                           EOP_INT_SEL_NONE,
1583                                           EOP_DATA_SEL_GDS,
1584                                           NULL,
1585                                           count_va | ((uint64_t)sctx->screen->info.address32_hi << 32),
1586                                           EOP_DATA_GDS(gds_offset / 4, 1),
1587                                           SI_NOT_QUERY);
1588
1589                         /* Now that compute shaders are running, clear the remainder of GDS. */
1590                         if (first_dispatch) {
1591                                 unsigned offset = gds_offset + gds_size;
1592                                 si_cp_dma_clear_buffer(sctx, cs, NULL, offset,
1593                                                        GDS_SIZE_UNORDERED - offset,
1594                                                        0,
1595                                                        SI_CPDMA_SKIP_CHECK_CS_SPACE |
1596                                                        SI_CPDMA_SKIP_GFX_SYNC |
1597                                                        SI_CPDMA_SKIP_SYNC_BEFORE,
1598                                                        SI_COHERENCY_NONE, L2_BYPASS);
1599                         }
1600                 }
1601                 first_dispatch = false;
1602
1603                 assert(cs->current.cdw <= cs->current.max_dw);
1604                 assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw);
1605         }
1606 }