1 /*
2 * Copyright 2019 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 *
24 */
25
26 #include "ac_llvm_cull.h"
27 #include "si_build_pm4.h"
28 #include "si_pipe.h"
29 #include "si_shader_internal.h"
30 #include "sid.h"
31 #include "util/fast_idiv_by_const.h"
32 #include "util/u_prim.h"
33 #include "util/u_suballoc.h"
34 #include "util/u_upload_mgr.h"
35
36 /* Based on:
37 * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf
38 */
39
40 /* This file implements primitive culling using asynchronous compute.
41 * It's written to be GL conformant.
42 *
43 * It takes a monolithic VS in LLVM IR returning gl_Position and invokes it
44 * in a compute shader. The shader processes 1 primitive/thread by invoking
45 * the VS for each vertex to get the positions, decomposes strips and fans
46 * into triangles (if needed), eliminates primitive restart (if needed),
47 * does (W<0) culling, face culling, view XY culling, zero-area and
48 * small-primitive culling, and generates a new index buffer that doesn't
49 * contain culled primitives.
50 *
51 * The index buffer is generated using the Ordered Count feature of GDS,
52 * which is an atomic counter that is incremented in the wavefront launch
53 * order, so that the original primitive order is preserved.
54 *
55 * Another GDS ordered counter is used to eliminate primitive restart indices.
56 * If a restart index lands on an even thread ID, the compute shader has to flip
57 * the primitive orientation of the whole following triangle strip. The primitive
58 * orientation has to be correct after strip and fan decomposition for two-sided
59 * shading to behave correctly. The decomposition also needs to be aware of
60 * which vertex is the provoking vertex for flat shading to behave correctly.
61 *
62 * IB = a GPU command buffer
63 *
64 * Both the compute and gfx IBs run in parallel sort of like CE and DE.
65 * The gfx IB has a CP barrier (REWIND packet) before a draw packet. REWIND
66 * doesn't continue if its word isn't 0x80000000. Once compute shaders are
67 * finished culling, the last wave will write the final primitive count from
68 * GDS directly into the count word of the draw packet in the gfx IB, and
69 * a CS_DONE event will signal the REWIND packet to continue. It's really
70 * a direct draw with command buffer patching from the compute queue.
71 *
72 * The compute IB doesn't have to start when its corresponding gfx IB starts,
73 * but can start sooner. The compute IB is signaled to start after the last
74 * execution barrier in the *previous* gfx IB. This is handled as follows.
75 * The kernel GPU scheduler starts the compute IB after the previous gfx IB has
76 * started. The compute IB then waits (WAIT_REG_MEM) for a mid-IB fence that
77 * represents the barrier in the previous gfx IB.
78 *
79 * Features:
80 * - Triangle strips and fans are decomposed into an indexed triangle list.
81 * The decomposition differs based on the provoking vertex state.
82 * - Instanced draws are converted into non-instanced draws for 16-bit indices.
83 * (InstanceID is stored in the high bits of VertexID and unpacked by VS)
84 * - Primitive restart is fully supported with triangle strips, including
85 * correct primitive orientation across multiple waves. (restart indices
86 * reset primitive orientation)
87 * - W<0 culling (W<0 is behind the viewer, sort of like near Z culling).
88 * - Back face culling, incl. culling zero-area / degenerate primitives.
89 * - View XY culling.
90 * - View Z culling (disabled due to limited impact with perspective projection).
91 * - Small primitive culling for all MSAA modes and all quant modes.
92 *
93 * The following are not implemented:
94 * - ClipVertex/ClipDistance/CullDistance-based culling.
95 * - Scissor culling.
96 * - HiZ culling.
97 *
98 * Limitations (and unimplemented features that may be possible to implement):
99 * - Only triangles, triangle strips, and triangle fans are supported.
100 * - Primitive restart is only supported with triangle strips.
101 * - Instancing and primitive restart can't be used together.
102 * - Instancing is only supported with 16-bit indices and instance count <= 2^16.
103 * - The instance divisor buffer is unavailable, so all divisors must be
104 * either 0 or 1.
105 * - Multidraws where the vertex shader reads gl_DrawID are unsupported.
106 * - No support for tessellation and geometry shaders.
107 * (patch elimination where tess factors are 0 would be possible to implement)
108 * - The vertex shader must not contain memory stores.
109 * - All VS resources must not have a write usage in the command buffer.
110 * - Bindless textures and images must not occur in the vertex shader.
111 *
112 * User data SGPR layout:
113 * INDEX_BUFFERS: pointer to constants
114 * 0..3: input index buffer - typed buffer view
115 * 4..7: output index buffer - typed buffer view
116 * 8..11: viewport state - scale.xy, translate.xy
117 * VERTEX_COUNTER: counter address or first primitive ID
118 * - If unordered memory counter: address of "count" in the draw packet
119  *        which is incremented atomically by the shader.
120 * - If unordered GDS counter: address of "count" in GDS starting from 0,
121 * must be initialized to 0 before the dispatch.
122 * - If ordered GDS counter: the primitive ID that should reset the vertex
123 * counter to 0 in GDS
124 * LAST_WAVE_PRIM_ID: the primitive ID that should write the final vertex
125 * count to memory if using GDS ordered append
126 * VERTEX_COUNT_ADDR: where the last wave should write the vertex count if
127 * using GDS ordered append
128 * VS.VERTEX_BUFFERS: same value as VS
129 * VS.CONST_AND_SHADER_BUFFERS: same value as VS
130 * VS.SAMPLERS_AND_IMAGES: same value as VS
131 * VS.BASE_VERTEX: same value as VS
132 * VS.START_INSTANCE: same value as VS
133 * NUM_PRIMS_UDIV_MULTIPLIER: For fast 31-bit division by the number of primitives
134 * per instance for instancing.
135 * NUM_PRIMS_UDIV_TERMS:
136 * - Bits [0:4]: "post_shift" for fast 31-bit division for instancing.
137 * - Bits [5:31]: The number of primitives per instance for computing the remainder.
138 * PRIMITIVE_RESTART_INDEX
139 * SMALL_PRIM_CULLING_PRECISION: Scale the primitive bounding box by this number.
140 *
141 *
142 * The code contains 3 codepaths:
143 * - Unordered memory counter (for debugging, random primitive order, no primitive restart)
144 * - Unordered GDS counter (for debugging, random primitive order, no primitive restart)
145 * - Ordered GDS counter (it preserves the primitive order)
146 *
147 * How to test primitive restart (the most complicated part because it needs
148 * to get the primitive orientation right):
149 * Set THREADGROUP_SIZE to 2 to exercise both intra-wave and inter-wave
150 * primitive orientation flips with small draw calls, which is what most tests use.
151 * You can also enable draw call splitting into draw calls with just 2 primitives.
152 */
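
/* Illustrative sketch, not used by the driver: how the NUM_PRIMS_UDIV_* user
 * SGPRs described above fit together. The shader below recovers the values
 * with "terms & 0x1f" and "terms >> 5", so conceptually the CPU side packs:
 *
 *    struct si_fast_udiv_info32 udiv =
 *       si_compute_fast_udiv_info32(num_prims_per_instance, 31);
 *    uint32_t udiv_multiplier = ...; // the multiplier from "udiv" (exact
 *                                    // field name not shown in this file)
 *    uint32_t udiv_terms = udiv.post_shift | (num_prims_per_instance << 5);
 *
 * This only works because post_shift < 32 and the per-instance primitive
 * count fits in 27 bits, which the dispatch code asserts.
 */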
153
154 /* At least 256 is needed for the fastest wave launch rate from compute queues
155 * due to hw constraints. Nothing in the code needs more than 1 wave/threadgroup. */
156 #define THREADGROUP_SIZE 256 /* high numbers limit available VGPRs */
157 #define THREADGROUPS_PER_CU 1 /* TGs to launch on 1 CU before going onto the next, max 8 */
158 #define MAX_WAVES_PER_SH 0 /* no limit */
159 #define INDEX_STORES_USE_SLC 1 /* don't cache indices if L2 is full */
160 /* Don't cull Z. We already do (W < 0) culling for primitives behind the viewer. */
161 #define CULL_Z 0
162 /* 0 = unordered memory counter, 1 = unordered GDS counter, 2 = ordered GDS counter */
163 #define VERTEX_COUNTER_GDS_MODE 2
164 #define GDS_SIZE_UNORDERED (4 * 1024) /* only for the unordered GDS counter */
165
166 /* Grouping compute dispatches for small draw calls: How many primitives from multiple
167 * draw calls to process by compute before signaling the gfx IB. This reduces the number
168 * of EOP events + REWIND packets, because they decrease performance. */
169 #define PRIMS_PER_BATCH (512 * 1024)
170 /* Draw call splitting at the packet level. This allows signaling the gfx IB
171 * for big draw calls sooner, but doesn't allow context flushes between packets.
172 * Primitive restart is supported. Only implemented for ordered append. */
173 #define SPLIT_PRIMS_PACKET_LEVEL_VALUE PRIMS_PER_BATCH
174 /* If there is not enough ring buffer space for the current IB, split draw calls into
175 * this number of primitives, so that we can flush the context and get free ring space. */
176 #define SPLIT_PRIMS_DRAW_LEVEL PRIMS_PER_BATCH
177
178 /* Derived values. */
179 #define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64)
180 #define SPLIT_PRIMS_PACKET_LEVEL \
181 (VERTEX_COUNTER_GDS_MODE == 2 ? SPLIT_PRIMS_PACKET_LEVEL_VALUE \
182 : UINT_MAX & ~(THREADGROUP_SIZE - 1))
183
184 #define REWIND_SIGNAL_BIT 0x80000000
185 /* For emulating the rewind packet on CI. */
186 #define FORCE_REWIND_EMULATION 0
187
188 void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context,
189 unsigned *prim_discard_vertex_count_threshold,
190 unsigned *index_ring_size_per_ib)
191 {
192 *prim_discard_vertex_count_threshold = UINT_MAX; /* disable */
193
194 if (sscreen->info.chip_class == GFX6 || /* SI support is not implemented */
195 !sscreen->info.has_gds_ordered_append || sscreen->debug_flags & DBG(NO_PD) || is_aux_context)
196 return;
197
198 /* TODO: enable this after the GDS kernel memory management is fixed */
199 bool enable_on_pro_graphics_by_default = false;
200
201 if (sscreen->debug_flags & DBG(ALWAYS_PD) || sscreen->debug_flags & DBG(PD) ||
202 (enable_on_pro_graphics_by_default && sscreen->info.is_pro_graphics &&
203 (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_HAWAII ||
204 sscreen->info.family == CHIP_TONGA || sscreen->info.family == CHIP_FIJI ||
205 sscreen->info.family == CHIP_POLARIS10 || sscreen->info.family == CHIP_POLARIS11 ||
206 sscreen->info.family == CHIP_VEGA10 || sscreen->info.family == CHIP_VEGA20))) {
207 *prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */
208
209 if (sscreen->debug_flags & DBG(ALWAYS_PD))
210 *prim_discard_vertex_count_threshold = 0; /* always enable */
211
212 const uint32_t MB = 1024 * 1024;
213 const uint64_t GB = 1024 * 1024 * 1024;
214
215 /* The total size is double this per context.
216 * Greater numbers allow bigger gfx IBs.
217 */
218 if (sscreen->info.vram_size <= 2 * GB)
219 *index_ring_size_per_ib = 64 * MB;
220 else if (sscreen->info.vram_size <= 4 * GB)
221 *index_ring_size_per_ib = 128 * MB;
222 else
223 *index_ring_size_per_ib = 256 * MB;
224 }
225 }
226
227 /* Opcode can be "add" or "swap". */
228 static LLVMValueRef si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode,
229 LLVMValueRef m0, LLVMValueRef value,
230 unsigned ordered_count_index, bool release, bool done)
231 {
232 if (ctx->screen->info.chip_class >= GFX10)
233 ordered_count_index |= 1 << 24; /* number of dwords == 1 */
234
235 LLVMValueRef args[] = {
236 LLVMBuildIntToPtr(ctx->ac.builder, m0, LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), ""),
237 value,
238 LLVMConstInt(ctx->ac.i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */
239 ctx->ac.i32_0, /* scope */
240 ctx->ac.i1false, /* volatile */
241 LLVMConstInt(ctx->ac.i32, ordered_count_index, 0),
242 LLVMConstInt(ctx->ac.i1, release, 0),
243 LLVMConstInt(ctx->ac.i1, done, 0),
244 };
245
246 char intrinsic[64];
247 snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode);
248 return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->ac.i32, args, ARRAY_SIZE(args), 0);
249 }
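
/* Usage sketch (illustrative): the helper above is used in two ways further
 * down. "swap" publishes a new value for the ordered counter and returns what
 * the previous waves stored, while "add" with a 0 operand only reads it:
 *
 *    prev = si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, new_value,
 *                                  ordered_count_index, true, false);
 *    prev = si_build_ds_ordered_op(ctx, "add", ordered_wave_id, ctx->ac.i32_0,
 *                                  ordered_count_index, true, false);
 *
 * Counter index 0 holds the primitive count and index 1 the primitive-restart
 * orientation state; the GDS base address is always 0 with ordered append.
 */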
250
251 static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr)
252 {
253 uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32;
254 ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->ac.i64, "");
255 ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->ac.i64, hi, 0), "");
256 return LLVMBuildIntToPtr(ctx->ac.builder, ptr,
257 LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GLOBAL), "");
258 }
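
/* Example (illustrative): with a 32-bit address of 0x00400000 and
 * info.address32_hi == 0xFF, the expanded pointer is 0x000000FF00400000,
 * i.e. the high half comes from the screen's 32-bit address space base.
 */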
259
260 struct si_thread0_section {
261 struct si_shader_context *ctx;
262 LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */
263 LLVMValueRef saved_exec;
264 };
265
266 /* Enter a section that only executes on thread 0. */
267 static void si_enter_thread0_section(struct si_shader_context *ctx,
268 struct si_thread0_section *section, LLVMValueRef thread_id)
269 {
270 section->ctx = ctx;
271 section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "result0");
272
273 /* This IF has 4 instructions:
274 * v_and_b32_e32 v, 63, v ; get the thread ID
275 * v_cmp_eq_u32_e32 vcc, 0, v ; thread ID == 0
276 * s_and_saveexec_b64 s, vcc
277 * s_cbranch_execz BB0_4
278 *
279 * It could just be s_and_saveexec_b64 s, 1.
280 */
281 ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id, ctx->ac.i32_0, ""),
282 12601);
283 }
284
285 /* Exit a section that only executes on thread 0 and broadcast the result
286 * to all threads. */
287 static void si_exit_thread0_section(struct si_thread0_section *section, LLVMValueRef *result)
288 {
289 struct si_shader_context *ctx = section->ctx;
290
291 LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result);
292
293 ac_build_endif(&ctx->ac, 12601);
294
295 /* Broadcast the result from thread 0 to all threads. */
296 *result =
297 ac_build_readlane(&ctx->ac, LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL);
298 }
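
/* Illustrative usage of the thread-0 section helpers, mirroring what the
 * code below does around the GDS atomics: lane 0 performs the operation and
 * the result is broadcast to the whole wave on exit.
 *
 *    struct si_thread0_section section;
 *    LLVMValueRef value;
 *
 *    si_enter_thread0_section(ctx, &section, thread_id);
 *    value = ...; // e.g. one atomic add executed once per wave
 *    si_exit_thread0_section(&section, &value);
 *    // "value" now holds the thread-0 result in every lane
 */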
299
300 void si_build_prim_discard_compute_shader(struct si_shader_context *ctx)
301 {
302 struct si_shader_key *key = &ctx->shader->key;
303 LLVMBuilderRef builder = ctx->ac.builder;
304 LLVMValueRef vs = ctx->main_fn;
305
306 /* Always inline the VS function. */
307 ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE);
308 LLVMSetLinkage(vs, LLVMPrivateLinkage);
309
310 enum ac_arg_type const_desc_type;
311 if (ctx->shader->selector->info.const_buffers_declared == 1 &&
312 ctx->shader->selector->info.shader_buffers_declared == 0)
313 const_desc_type = AC_ARG_CONST_FLOAT_PTR;
314 else
315 const_desc_type = AC_ARG_CONST_DESC_PTR;
316
317 memset(&ctx->args, 0, sizeof(ctx->args));
318
319 struct ac_arg param_index_buffers_and_constants, param_vertex_counter;
320 struct ac_arg param_vb_desc, param_const_desc;
321 struct ac_arg param_base_vertex, param_start_instance;
322 struct ac_arg param_block_id, param_local_id, param_ordered_wave_id;
323 struct ac_arg param_restart_index, param_smallprim_precision;
324 struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms;
325 struct ac_arg param_sampler_desc, param_last_wave_prim_id, param_vertex_count_addr;
326
327 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
328 &param_index_buffers_and_constants);
329 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_vertex_counter);
330 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_last_wave_prim_id);
331 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_vertex_count_addr);
332 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &param_vb_desc);
333 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type, &param_const_desc);
334 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, &param_sampler_desc);
335 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_base_vertex);
336 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_start_instance);
337 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_multiplier);
338 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_terms);
339 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_restart_index);
340 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, &param_smallprim_precision);
341
342 /* Block ID and thread ID inputs. */
343 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_block_id);
344 if (VERTEX_COUNTER_GDS_MODE == 2)
345 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_ordered_wave_id);
346 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &param_local_id);
347
348 /* Create the compute shader function. */
349 unsigned old_type = ctx->type;
350 gl_shader_stage old_stage = ctx->stage;
351 ctx->type = PIPE_SHADER_COMPUTE;
352 ctx->stage = MESA_SHADER_COMPUTE;
353 si_llvm_create_func(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE);
354 ctx->type = old_type;
355 ctx->stage = old_stage;
356
357 if (VERTEX_COUNTER_GDS_MODE == 2) {
358 ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256);
359 } else if (VERTEX_COUNTER_GDS_MODE == 1) {
360 ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", GDS_SIZE_UNORDERED);
361 }
362
363 /* Assemble parameters for VS. */
364 LLVMValueRef vs_params[16];
365 unsigned num_vs_params = 0;
366 unsigned param_vertex_id, param_instance_id;
367
368 vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */
369 vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */
370 vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc);
371 vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc);
372 vs_params[num_vs_params++] =
373 LLVMConstInt(ctx->ac.i32, S_VS_STATE_INDEXED(key->opt.cs_indexed), 0);
374 vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex);
375 vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance);
376 vs_params[num_vs_params++] = ctx->ac.i32_0; /* DrawID */
377 vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc);
378
379 vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */
380 vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */
381 vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused (PrimID) */
382 vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused */
383
384 assert(num_vs_params <= ARRAY_SIZE(vs_params));
385 assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs))));
386
387 /* Load descriptors. (load 8 dwords at once) */
388 LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8];
389
390 LLVMValueRef index_buffers_and_constants =
391 ac_get_arg(&ctx->ac, param_index_buffers_and_constants);
392 tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants,
393 ac_array_in_const32_addr_space(ctx->ac.v8i32), "");
394 tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->ac.i32_0);
395
396 for (unsigned i = 0; i < 8; i++)
397 desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i);
398
399 input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4);
400 output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4);
401
402 /* Compute PrimID and InstanceID. */
403 LLVMValueRef global_thread_id = ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id),
404 LLVMConstInt(ctx->ac.i32, THREADGROUP_SIZE, 0),
405 ac_get_arg(&ctx->ac, param_local_id));
406 LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */
407 LLVMValueRef instance_id = ctx->ac.i32_0;
408
409 if (key->opt.cs_instancing) {
410 LLVMValueRef num_prims_udiv_terms = ac_get_arg(&ctx->ac, param_num_prims_udiv_terms);
411 LLVMValueRef num_prims_udiv_multiplier =
412 ac_get_arg(&ctx->ac, param_num_prims_udiv_multiplier);
413 /* Unpack num_prims_udiv_terms. */
414 LLVMValueRef post_shift =
415 LLVMBuildAnd(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 0x1f, 0), "");
416 LLVMValueRef prims_per_instance =
417 LLVMBuildLShr(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 5, 0), "");
418 /* Divide the total prim_id by the number of prims per instance. */
419 instance_id =
420 ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id, num_prims_udiv_multiplier, post_shift);
421 /* Compute the remainder. */
422 prim_id = LLVMBuildSub(builder, prim_id,
423 LLVMBuildMul(builder, instance_id, prims_per_instance, ""), "");
424 }
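
   /* Worked example (illustrative): with 10 primitives per instance, a thread
    * with global prim_id = 25 computes instance_id = 25 / 10 = 2 and the
    * remainder 25 - 2 * 10 = 5, i.e. it processes the 6th primitive of the
    * 3rd instance.
    */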
425
426 /* Generate indices (like a non-indexed draw call). */
427 LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->ac.i32)};
428 unsigned vertices_per_prim = 3;
429
430 switch (key->opt.cs_prim_type) {
431 case PIPE_PRIM_TRIANGLES:
432 for (unsigned i = 0; i < 3; i++) {
433 index[i] = ac_build_imad(&ctx->ac, prim_id, LLVMConstInt(ctx->ac.i32, 3, 0),
434 LLVMConstInt(ctx->ac.i32, i, 0));
435 }
436 break;
437 case PIPE_PRIM_TRIANGLE_STRIP:
438 for (unsigned i = 0; i < 3; i++) {
439 index[i] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, i, 0), "");
440 }
441 break;
442 case PIPE_PRIM_TRIANGLE_FAN:
443 /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper
444 * and rasterizer as a normal triangle, so we need to put the provoking
445 * vertex into the correct index variable and preserve orientation at the same time.
446 * gl_VertexID is preserved, because it's equal to the index.
447 */
448 if (key->opt.cs_provoking_vertex_first) {
449 index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), "");
450 index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), "");
451 index[2] = ctx->ac.i32_0;
452 } else {
453 index[0] = ctx->ac.i32_0;
454 index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), "");
455 index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), "");
456 }
457 break;
458 default:
459 unreachable("unexpected primitive type");
460 }
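
   /* Worked example (illustrative): for prim_id = 2 the generated indices are
    * (6, 7, 8) for PIPE_PRIM_TRIANGLES, (2, 3, 4) for PIPE_PRIM_TRIANGLE_STRIP,
    * and (0, 3, 4) for PIPE_PRIM_TRIANGLE_FAN with the last provoking vertex
    * (or (3, 4, 0) when the provoking vertex comes first).
    */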
461
462 /* Fetch indices. */
463 if (key->opt.cs_indexed) {
464 for (unsigned i = 0; i < 3; i++) {
465 index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, index[i], ctx->ac.i32_0,
466 1, 0, true, false);
467 index[i] = ac_to_integer(&ctx->ac, index[i]);
468 }
469 }
470
471 LLVMValueRef ordered_wave_id = NULL;
472
473 /* Extract the ordered wave ID. */
474 if (VERTEX_COUNTER_GDS_MODE == 2) {
475 ordered_wave_id = ac_get_arg(&ctx->ac, param_ordered_wave_id);
476 ordered_wave_id =
477 LLVMBuildLShr(builder, ordered_wave_id, LLVMConstInt(ctx->ac.i32, 6, 0), "");
478 ordered_wave_id =
479 LLVMBuildAnd(builder, ordered_wave_id, LLVMConstInt(ctx->ac.i32, 0xfff, 0), "");
480 }
481 LLVMValueRef thread_id = LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id),
482 LLVMConstInt(ctx->ac.i32, 63, 0), "");
483
484 /* Every other triangle in a strip has a reversed vertex order, so we
485 * need to swap vertices of odd primitives to get the correct primitive
486 * orientation when converting triangle strips to triangles. Primitive
487 * restart complicates it, because a strip can start anywhere.
488 */
489 LLVMValueRef prim_restart_accepted = ctx->ac.i1true;
490 LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter);
491
492 if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) {
493 /* Without primitive restart, odd primitives have reversed orientation.
494 * Only primitive restart can flip it with respect to the first vertex
495 * of the draw call.
496 */
497 LLVMValueRef first_is_odd = ctx->ac.i1false;
498
499 /* Handle primitive restart. */
500 if (key->opt.cs_primitive_restart) {
501 /* Get the GDS primitive restart continue flag and clear
502 * the flag in vertex_counter. This flag is used when the draw
503 * call was split and we need to load the primitive orientation
504 * flag from GDS for the first wave too.
505 */
506 LLVMValueRef gds_prim_restart_continue =
507 LLVMBuildLShr(builder, vertex_counter, LLVMConstInt(ctx->ac.i32, 31, 0), "");
508 gds_prim_restart_continue =
509 LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->ac.i1, "");
510 vertex_counter =
511 LLVMBuildAnd(builder, vertex_counter, LLVMConstInt(ctx->ac.i32, 0x7fffffff, 0), "");
512
513 LLVMValueRef index0_is_reset;
514
515 for (unsigned i = 0; i < 3; i++) {
516 LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i],
517 ac_get_arg(&ctx->ac, param_restart_index), "");
518 if (i == 0)
519 index0_is_reset = LLVMBuildNot(builder, not_reset, "");
520 prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted, not_reset, "");
521 }
522
523 /* If the previous waves flip the primitive orientation
524 * of the current triangle strip, it will be stored in GDS.
525 *
526 * Sometimes the correct orientation is not needed, in which case
527 * we don't need to execute this.
528 */
529 if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) {
530 /* If there are reset indices in this wave, get the thread index
531 * where the most recent strip starts relative to each thread.
532 */
533 LLVMValueRef preceding_threads_mask =
534 LLVMBuildSub(builder,
535 LLVMBuildShl(builder, ctx->ac.i64_1,
536 LLVMBuildZExt(builder, thread_id, ctx->ac.i64, ""), ""),
537 ctx->ac.i64_1, "");
538
539 LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset);
540 LLVMValueRef preceding_reset_threadmask =
541 LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, "");
542 LLVMValueRef strip_start = ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL);
543 strip_start = LLVMBuildAdd(builder, strip_start, ctx->ac.i32_1, "");
544
545             /* This flips the orientation based on reset indices within this wave only. */
546 first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->ac.i1, "");
547
548 LLVMValueRef last_strip_start, prev_wave_state, ret, tmp;
549 LLVMValueRef is_first_wave, current_wave_resets_index;
550
551 /* Get the thread index where the last strip starts in this wave.
552 *
553 * If the last strip doesn't start in this wave, the thread index
554 * will be 0.
555 *
556 * If the last strip starts in the next wave, the thread index will
557 * be 64.
558 */
559 last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL);
560 last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->ac.i32_1, "");
561
562 struct si_thread0_section section;
563 si_enter_thread0_section(ctx, &section, thread_id);
564
565 /* This must be done in the thread 0 section, because
566 * we expect PrimID to be 0 for the whole first wave
567 * in this expression.
568 *
569 * NOTE: This will need to be different if we wanna support
570 * instancing with primitive restart.
571 */
572 is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->ac.i32_0, "");
573 is_first_wave = LLVMBuildAnd(builder, is_first_wave,
574 LLVMBuildNot(builder, gds_prim_restart_continue, ""), "");
575 current_wave_resets_index =
576 LLVMBuildICmp(builder, LLVMIntNE, last_strip_start, ctx->ac.i32_0, "");
577
578 ret = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "prev_state");
579
580 /* Save the last strip start primitive index in GDS and read
581 * the value that previous waves stored.
582 *
583 * if (is_first_wave || current_wave_resets_strip)
584 * // Read the value that previous waves stored and store a new one.
585 * first_is_odd = ds.ordered.swap(last_strip_start);
586 * else
587 * // Just read the value that previous waves stored.
588 * first_is_odd = ds.ordered.add(0);
589 */
590 ac_build_ifcc(
591 &ctx->ac, LLVMBuildOr(builder, is_first_wave, current_wave_resets_index, ""), 12602);
592 {
593 /* The GDS address is always 0 with ordered append. */
594 tmp = si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, last_strip_start, 1, true,
595 false);
596 LLVMBuildStore(builder, tmp, ret);
597 }
598 ac_build_else(&ctx->ac, 12603);
599 {
600 /* Just read the value from GDS. */
601 tmp = si_build_ds_ordered_op(ctx, "add", ordered_wave_id, ctx->ac.i32_0, 1, true,
602 false);
603 LLVMBuildStore(builder, tmp, ret);
604 }
605 ac_build_endif(&ctx->ac, 12602);
606
607 prev_wave_state = LLVMBuildLoad(builder, ret, "");
608 /* Ignore the return value if this is the first wave. */
609 prev_wave_state =
610 LLVMBuildSelect(builder, is_first_wave, ctx->ac.i32_0, prev_wave_state, "");
611 si_exit_thread0_section(&section, &prev_wave_state);
612 prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->ac.i1, "");
613
614 /* If the strip start appears to be on thread 0 for the current primitive
615 * (meaning the reset index is not present in this wave and might have
616 * appeared in previous waves), use the value from GDS to determine
617 * primitive orientation.
618 *
619 * If the strip start is in this wave for the current primitive, use
620 * the value from the current wave to determine primitive orientation.
621 */
622 LLVMValueRef strip_start_is0 =
623 LLVMBuildICmp(builder, LLVMIntEQ, strip_start, ctx->ac.i32_0, "");
624 first_is_odd =
625 LLVMBuildSelect(builder, strip_start_is0, prev_wave_state, first_is_odd, "");
626 }
627 }
628 /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */
629 LLVMValueRef prim_is_odd = LLVMBuildXor(
630 builder, first_is_odd, LLVMBuildTrunc(builder, thread_id, ctx->ac.i1, ""), "");
631
632 /* Convert triangle strip indices to triangle indices. */
633 ac_build_triangle_strip_indices_to_triangle(
634 &ctx->ac, prim_is_odd, LLVMConstInt(ctx->ac.i1, key->opt.cs_provoking_vertex_first, 0),
635 index);
636 }
637
638 /* Execute the vertex shader for each vertex to get vertex positions. */
639 LLVMValueRef pos[3][4];
640 for (unsigned i = 0; i < vertices_per_prim; i++) {
641 vs_params[param_vertex_id] = index[i];
642 vs_params[param_instance_id] = instance_id;
643
644 LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params);
645 for (unsigned chan = 0; chan < 4; chan++)
646 pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, "");
647 }
648
649 /* Divide XYZ by W. */
650 for (unsigned i = 0; i < vertices_per_prim; i++) {
651 for (unsigned chan = 0; chan < 3; chan++)
652 pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]);
653 }
654
655 /* Load the viewport state. */
656 LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants,
657 LLVMConstInt(ctx->ac.i32, 2, 0));
658 vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
659 LLVMValueRef vp_scale[2], vp_translate[2];
660 vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
661 vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
662 vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
663 vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);
664
665 /* Do culling. */
666 struct ac_cull_options options = {};
667 options.cull_front = key->opt.cs_cull_front;
668 options.cull_back = key->opt.cs_cull_back;
669 options.cull_view_xy = true;
670 options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z;
671 options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z;
672 options.cull_small_prims = true;
673 options.cull_zero_area = true;
674 options.cull_w = true;
675 options.use_halfz_clip_space = key->opt.cs_halfz_clip_space;
676
677 LLVMValueRef accepted =
678 ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, vp_scale, vp_translate,
679 ac_get_arg(&ctx->ac, param_smallprim_precision), &options);
680
681 ac_build_optimization_barrier(&ctx->ac, &accepted);
682 LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted);
683
684 /* Count the number of active threads by doing bitcount(accepted). */
685 LLVMValueRef num_prims_accepted = ac_build_intrinsic(
686 &ctx->ac, "llvm.ctpop.i64", ctx->ac.i64, &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE);
687 num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->ac.i32, "");
688
689 LLVMValueRef start;
690
691 /* Execute atomic_add on the vertex count. */
692 struct si_thread0_section section;
693 si_enter_thread0_section(ctx, &section, thread_id);
694 {
695 if (VERTEX_COUNTER_GDS_MODE == 0) {
696 LLVMValueRef num_indices = LLVMBuildMul(
697 builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
698 vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter);
699 start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices,
700 LLVMAtomicOrderingMonotonic, false);
701 } else if (VERTEX_COUNTER_GDS_MODE == 1) {
702 LLVMValueRef num_indices = LLVMBuildMul(
703 builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
704 vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter,
705 LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), "");
706 start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices,
707 LLVMAtomicOrderingMonotonic, false);
708 } else if (VERTEX_COUNTER_GDS_MODE == 2) {
709 LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
710
711 /* If the draw call was split into multiple subdraws, each using
712 * a separate draw packet, we need to start counting from 0 for
713 * the first compute wave of the subdraw.
714 *
715 * vertex_counter contains the primitive ID of the first thread
716 * in the first wave.
717 *
718 * This is only correct with VERTEX_COUNTER_GDS_MODE == 2:
719 */
720 LLVMValueRef is_first_wave =
721 LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id, vertex_counter, "");
722
723 /* Store the primitive count for ordered append, not vertex count.
724 * The idea is to avoid GDS initialization via CP DMA. The shader
725 * effectively stores the first count using "swap".
726 *
727 * if (first_wave) {
728 * ds.ordered.swap(num_prims_accepted); // store the first primitive count
729 * previous = 0;
730 * } else {
731 * previous = ds.ordered.add(num_prims_accepted) // add the primitive count
732 * }
733 */
734 ac_build_ifcc(&ctx->ac, is_first_wave, 12604);
735 {
736 /* The GDS address is always 0 with ordered append. */
737 si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, num_prims_accepted, 0, true, true);
738 LLVMBuildStore(builder, ctx->ac.i32_0, tmp_store);
739 }
740 ac_build_else(&ctx->ac, 12605);
741 {
742 LLVMBuildStore(builder,
743 si_build_ds_ordered_op(ctx, "add", ordered_wave_id, num_prims_accepted,
744 0, true, true),
745 tmp_store);
746 }
747 ac_build_endif(&ctx->ac, 12604);
748
749 start = LLVMBuildLoad(builder, tmp_store, "");
750 }
751 }
752 si_exit_thread0_section(&section, &start);
753
754 /* Write the final vertex count to memory. An EOS/EOP event could do this,
755 * but those events are super slow and should be avoided if performance
756 * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE
757 * event like this.
758 */
759 if (VERTEX_COUNTER_GDS_MODE == 2) {
760 ac_build_ifcc(&ctx->ac,
761 LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
762 ac_get_arg(&ctx->ac, param_last_wave_prim_id), ""),
763 12606);
764 LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, "");
765 count = LLVMBuildMul(builder, count, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
766
767 /* GFX8 needs to disable caching, so that the CP can see the stored value.
768 * MTYPE=3 bypasses TC L2.
769 */
770 if (ctx->screen->info.chip_class <= GFX8) {
771 LLVMValueRef desc[] = {
772 ac_get_arg(&ctx->ac, param_vertex_count_addr),
773 LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0),
774 LLVMConstInt(ctx->ac.i32, 4, 0),
775 LLVMConstInt(
776 ctx->ac.i32,
777 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | S_008F0C_MTYPE(3 /* uncached */),
778 0),
779 };
780 LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4);
781 ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->ac.i32_0, ctx->ac.i32_0, 0,
782 ac_glc | ac_slc);
783 } else {
784 LLVMBuildStore(
785 builder, count,
786 si_expand_32bit_pointer(ctx, ac_get_arg(&ctx->ac, param_vertex_count_addr)));
787 }
788 ac_build_endif(&ctx->ac, 12606);
789 } else {
790 /* For unordered modes that increment a vertex count instead of
791 * primitive count, convert it into the primitive index.
792 */
793 start = LLVMBuildUDiv(builder, start, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
794 }
795
796 /* Now we need to store the indices of accepted primitives into
797 * the output index buffer.
798 */
799 ac_build_ifcc(&ctx->ac, accepted, 16607);
800 {
801 /* Get the number of bits set before the index of this thread. */
802 LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask);
803
804 /* We have lowered instancing. Pack the instance ID into vertex ID. */
805 if (key->opt.cs_instancing) {
806 instance_id = LLVMBuildShl(builder, instance_id, LLVMConstInt(ctx->ac.i32, 16, 0), "");
807
808 for (unsigned i = 0; i < vertices_per_prim; i++)
809 index[i] = LLVMBuildOr(builder, index[i], instance_id, "");
810 }
811
812 if (VERTEX_COUNTER_GDS_MODE == 2) {
813 /* vertex_counter contains the first primitive ID
814 * for this dispatch. If the draw call was split into
815 * multiple subdraws, the first primitive ID is > 0
816 * for subsequent subdraws. Each subdraw uses a different
817 * portion of the output index buffer. Offset the store
818 * vindex by the first primitive ID to get the correct
819 * store address for the subdraw.
820 */
821 start = LLVMBuildAdd(builder, start, vertex_counter, "");
822 }
823
824 /* Write indices for accepted primitives. */
825 LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, "");
826 LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3);
827
828 if (!ac_has_vec3_support(ctx->ac.chip_class, true))
829 vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3);
830
831 ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, vindex, ctx->ac.i32_0,
832 ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0));
833 }
834 ac_build_endif(&ctx->ac, 16607);
835
836 LLVMBuildRetVoid(builder);
837 }
838
839 /* Return false if the shader isn't ready. */
840 static bool si_shader_select_prim_discard_cs(struct si_context *sctx,
841 const struct pipe_draw_info *info,
842 bool primitive_restart)
843 {
844 struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
845 struct si_shader_key key;
846
847 /* Primitive restart needs ordered counters. */
848 assert(!primitive_restart || VERTEX_COUNTER_GDS_MODE == 2);
849 assert(!primitive_restart || info->instance_count == 1);
850
851 memset(&key, 0, sizeof(key));
852 si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, &key, &key.part.vs.prolog);
853 assert(!key.part.vs.prolog.instance_divisor_is_fetched);
854
855 key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0;
856 key.opt.vs_as_prim_discard_cs = 1;
857 key.opt.cs_prim_type = info->mode;
858 key.opt.cs_indexed = info->index_size != 0;
859 key.opt.cs_instancing = info->instance_count > 1;
860 key.opt.cs_primitive_restart = primitive_restart;
861 key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first;
862
863 /* Primitive restart with triangle strips needs to preserve primitive
864 * orientation for cases where front and back primitive orientation matters.
865 */
866 if (primitive_restart) {
867 struct si_shader_selector *ps = sctx->ps_shader.cso;
868
869 key.opt.cs_need_correct_orientation = rs->cull_front != rs->cull_back ||
870 ps->info.uses_frontface ||
871 (rs->two_side && ps->info.colors_read);
872 }
873
874 if (rs->rasterizer_discard) {
875 /* Just for performance testing and analysis of trivial bottlenecks.
876 * This should result in a very short compute shader. */
877 key.opt.cs_cull_front = 1;
878 key.opt.cs_cull_back = 1;
879 } else {
880 key.opt.cs_cull_front = sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front;
881 key.opt.cs_cull_back = sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back;
882 }
883
884 if (!rs->depth_clamp_any && CULL_Z) {
885 key.opt.cs_cull_z = 1;
886 key.opt.cs_halfz_clip_space = rs->clip_halfz;
887 }
888
889 sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso;
890 sctx->cs_prim_discard_state.current = NULL;
891
892 if (!sctx->compiler.passes)
893 si_init_compiler(sctx->screen, &sctx->compiler);
894
895 struct si_compiler_ctx_state compiler_state;
896 compiler_state.compiler = &sctx->compiler;
897 compiler_state.debug = sctx->debug;
898 compiler_state.is_debug_context = sctx->is_debug;
899
900 return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state, &compiler_state,
901 &key, -1, true) == 0 &&
902 /* Disallow compute shaders using the scratch buffer. */
903 sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0;
904 }
905
906 static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx)
907 {
908 if (sctx->index_ring)
909 return true;
910
911 if (!sctx->prim_discard_compute_cs) {
912 struct radeon_winsys *ws = sctx->ws;
913 unsigned gds_size =
914 VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED : VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0;
915 unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 2 : 0;
916
917 if (gds_size) {
918 sctx->gds = ws->buffer_create(ws, gds_size, 4, RADEON_DOMAIN_GDS, 0);
919 if (!sctx->gds)
920 return false;
921
922 ws->cs_add_buffer(sctx->gfx_cs, sctx->gds, RADEON_USAGE_READWRITE, 0, 0);
923 }
924 if (num_oa_counters) {
925 assert(gds_size);
926 sctx->gds_oa = ws->buffer_create(ws, num_oa_counters, 1, RADEON_DOMAIN_OA, 0);
927 if (!sctx->gds_oa)
928 return false;
929
930 ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa, RADEON_USAGE_READWRITE, 0, 0);
931 }
932
933 sctx->prim_discard_compute_cs =
934 ws->cs_add_parallel_compute_ib(sctx->gfx_cs, num_oa_counters > 0);
935 if (!sctx->prim_discard_compute_cs)
936 return false;
937 }
938
939 if (!sctx->index_ring) {
940 sctx->index_ring = si_aligned_buffer_create(
941 sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT,
942 sctx->index_ring_size_per_ib * 2, sctx->screen->info.pte_fragment_size);
943 if (!sctx->index_ring)
944 return false;
945 }
946 return true;
947 }
948
949 static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_size)
950 {
951 return sctx->index_ring_offset +
952 align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <=
953 sctx->index_ring_size_per_ib;
954 }
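
/* Rough sizing example (illustrative): each triangle needs 12 bytes in the
 * output index buffer (3 x 32-bit indices), so a 1M-primitive draw wants
 * ~12 MB of ring space. With the smallest ring (64 MB per IB on boards with
 * <= 2 GB of VRAM) roughly five such draws fit per IB before the ring counts
 * as full and draws are split or the context is flushed.
 */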
955
956 enum si_prim_discard_outcome
957 si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info,
958 bool primitive_restart)
959 {
960 /* If the compute shader compilation isn't finished, this returns false. */
961 if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart))
962 return SI_PRIM_DISCARD_DISABLED;
963
964 if (!si_initialize_prim_discard_cmdbuf(sctx))
965 return SI_PRIM_DISCARD_DISABLED;
966
967 struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
968 unsigned prim = info->mode;
969 unsigned count = info->count;
970 unsigned instance_count = info->instance_count;
971 unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count);
972 unsigned num_prims = num_prims_per_instance * instance_count;
973 unsigned out_indexbuf_size = num_prims * 12;
974 bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size);
975 const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL;
976
977 /* Split draws at the draw call level if the ring is full. This makes
978 * better use of the ring space.
979 */
980 if (ring_full && num_prims > split_prims_draw_level &&
981 instance_count == 1 && /* TODO: support splitting instanced draws */
982 (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP))) {
983 /* Split draws. */
984 struct pipe_draw_info split_draw = *info;
985 split_draw.primitive_restart = primitive_restart;
986
987 unsigned base_start = split_draw.start;
988
989 if (prim == PIPE_PRIM_TRIANGLES) {
990 unsigned vert_count_per_subdraw = split_prims_draw_level * 3;
991 assert(vert_count_per_subdraw < count);
992
993 for (unsigned start = 0; start < count; start += vert_count_per_subdraw) {
994 split_draw.start = base_start + start;
995 split_draw.count = MIN2(count - start, vert_count_per_subdraw);
996
997 sctx->b.draw_vbo(&sctx->b, &split_draw);
998 }
999 } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) {
1000 /* No primitive pair can be split, because strips reverse orientation
1001 * for odd primitives. */
1002 STATIC_ASSERT(split_prims_draw_level % 2 == 0);
1003
1004 unsigned vert_count_per_subdraw = split_prims_draw_level;
1005
1006 for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) {
1007 split_draw.start = base_start + start;
1008 split_draw.count = MIN2(count - start, vert_count_per_subdraw + 2);
1009
1010 sctx->b.draw_vbo(&sctx->b, &split_draw);
1011
1012 if (start == 0 && primitive_restart &&
1013 sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation)
1014 sctx->preserve_prim_restart_gds_at_flush = true;
1015 }
1016 sctx->preserve_prim_restart_gds_at_flush = false;
1017 } else {
1018 assert(0);
1019 }
1020
1021 return SI_PRIM_DISCARD_DRAW_SPLIT;
1022 }
1023
1024 /* Just quit if the draw call doesn't fit into the ring and can't be split. */
1025 if (out_indexbuf_size > sctx->index_ring_size_per_ib) {
1026 if (SI_PRIM_DISCARD_DEBUG)
1027 puts("PD failed: draw call too big, can't be split");
1028 return SI_PRIM_DISCARD_DISABLED;
1029 }
1030
1031 unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL);
1032 unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ +
1033 24 * (num_subdraws - 1) + /* subdraws */
1034 30; /* leave some space at the end */
1035 unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx);
1036
1037 if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION)
1038 need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */
1039 else
1040 need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */
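
   /* Worked example (illustrative): a 2M-primitive draw with the default
    * 512K-primitive packet split gives num_subdraws = 4, so the compute IB
    * needs 11 + 34 + 24 * 3 + 30 = 147 dwords and the gfx IB needs an extra
    * 4 * 8 = 32 dwords for REWIND + DRAW (or 9 dwords on <= GFX7, which uses
    * the emulated path).
    */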
1041
1042 if (ring_full ||
1043 (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) ||
1044 !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) {
1045 /* If the current IB is empty but the size is too small, add a NOP
1046 * packet to force a flush and get a bigger IB.
1047 */
1048 if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) &&
1049 gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) {
1050 radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
1051 radeon_emit(gfx_cs, 0);
1052 }
1053
1054 si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
1055 }
1056
1057 /* The compute IB is always chained, but we need to call cs_check_space to add more space. */
1058 struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
1059 ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false);
1060 assert(compute_has_space);
1061 assert(si_check_ring_space(sctx, out_indexbuf_size));
1062 return SI_PRIM_DISCARD_ENABLED;
1063 }
1064
1065 void si_compute_signal_gfx(struct si_context *sctx)
1066 {
1067 struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
1068 unsigned writeback_L2_flags = 0;
1069
1070 /* The writeback L2 flags vary with each chip generation. */
1071 /* CI needs to flush vertex indices to memory. */
1072 if (sctx->chip_class <= GFX7)
1073 writeback_L2_flags = EVENT_TC_WB_ACTION_ENA;
1074 else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0)
1075 writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA;
1076
1077 if (!sctx->compute_num_prims_in_batch)
1078 return;
1079
1080 assert(sctx->compute_rewind_va);
1081
1082 /* After the queued dispatches are done and vertex counts are written to
1083 * the gfx IB, signal the gfx IB to continue. CP doesn't wait for
1084 * the dispatches to finish, it only adds the CS_DONE event into the event
1085 * queue.
1086 */
1087 si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags,
1088 sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
1089 writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM : EOP_INT_SEL_NONE,
1090 EOP_DATA_SEL_VALUE_32BIT, NULL,
1091 sctx->compute_rewind_va | ((uint64_t)sctx->screen->info.address32_hi << 32),
1092 REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */
1093 SI_NOT_QUERY);
1094
1095 sctx->compute_rewind_va = 0;
1096 sctx->compute_num_prims_in_batch = 0;
1097 }
1098
1099 /* Dispatch a primitive discard compute shader. */
1100 void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
1101 const struct pipe_draw_info *info, unsigned index_size,
1102 unsigned base_vertex, uint64_t input_indexbuf_va,
1103 unsigned input_indexbuf_num_elements)
1104 {
1105 struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
1106 struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
1107 unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, info->count);
1108 if (!num_prims_per_instance)
1109 return;
1110
1111 unsigned num_prims = num_prims_per_instance * info->instance_count;
1112 unsigned vertices_per_prim, output_indexbuf_format, gfx10_output_indexbuf_format;
1113
1114 switch (info->mode) {
1115 case PIPE_PRIM_TRIANGLES:
1116 case PIPE_PRIM_TRIANGLE_STRIP:
1117 case PIPE_PRIM_TRIANGLE_FAN:
1118 vertices_per_prim = 3;
1119 output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32;
1120 gfx10_output_indexbuf_format = V_008F0C_IMG_FORMAT_32_32_32_UINT;
1121 break;
1122 default:
1123 unreachable("unsupported primitive type");
1124 return;
1125 }
1126
1127 unsigned out_indexbuf_offset;
1128 uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4;
1129 bool first_dispatch = !sctx->prim_discard_compute_ib_initialized;
1130
1131 /* Initialize the compute IB if it's empty. */
1132 if (!sctx->prim_discard_compute_ib_initialized) {
1133 /* 1) State initialization. */
1134 sctx->compute_gds_offset = 0;
1135 sctx->compute_ib_last_shader = NULL;
1136
1137 if (sctx->last_ib_barrier_fence) {
1138 assert(!sctx->last_ib_barrier_buf);
1139 sctx->ws->cs_add_fence_dependency(gfx_cs, sctx->last_ib_barrier_fence,
1140 RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY);
1141 }
1142
1143 /* 2) IB initialization. */
1144
1145 /* This needs to be done at the beginning of IBs due to possible
1146 * TTM buffer moves in the kernel.
1147 */
1148 if (sctx->chip_class >= GFX10) {
1149 radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
1150 radeon_emit(cs, 0); /* CP_COHER_CNTL */
1151 radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
1152 radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */
1153 radeon_emit(cs, 0); /* CP_COHER_BASE */
1154 radeon_emit(cs, 0); /* CP_COHER_BASE_HI */
1155 radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
1156 radeon_emit(cs, /* GCR_CNTL */
1157 S_586_GLI_INV(V_586_GLI_ALL) | S_586_GLK_INV(1) | S_586_GLV_INV(1) |
1158 S_586_GL1_INV(1) | S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) |
1159 S_586_GLM_WB(1) | S_586_SEQ(V_586_SEQ_FORWARD));
1160 } else {
1161 si_emit_surface_sync(sctx, cs,
1162 S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
1163 S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) |
1164 S_0085F0_SH_ICACHE_ACTION_ENA(1) |
1165 S_0085F0_SH_KCACHE_ACTION_ENA(1));
1166 }
1167
1168 /* Restore the GDS prim restart counter if needed. */
1169 if (sctx->preserve_prim_restart_gds_at_flush) {
1170 si_cp_copy_data(sctx, cs, COPY_DATA_GDS, NULL, 4, COPY_DATA_SRC_MEM,
1171 sctx->wait_mem_scratch, 4);
1172 }
1173
1174 si_emit_initial_compute_regs(sctx, cs);
1175
1176 radeon_set_sh_reg(
1177 cs, R_00B860_COMPUTE_TMPRING_SIZE,
1178 S_00B860_WAVES(sctx->scratch_waves) | S_00B860_WAVESIZE(0)); /* no scratch */
1179
1180 /* Only 1D grids are launched. */
1181 radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2);
1182 radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) | S_00B820_NUM_THREAD_PARTIAL(1));
1183 radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) | S_00B824_NUM_THREAD_PARTIAL(1));
1184
1185 radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2);
1186 radeon_emit(cs, 0);
1187 radeon_emit(cs, 0);
1188
1189 /* Disable ordered alloc for OA resources. */
1190 for (unsigned i = 0; i < 2; i++) {
1191 radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3);
1192 radeon_emit(cs, S_031074_INDEX(i));
1193 radeon_emit(cs, 0);
1194 radeon_emit(cs, S_03107C_ENABLE(0));
1195 }
1196
1197 if (sctx->last_ib_barrier_buf) {
1198 assert(!sctx->last_ib_barrier_fence);
1199 radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, RADEON_USAGE_READ,
1200 RADEON_PRIO_FENCE);
1201 si_cp_wait_mem(sctx, cs,
1202 sctx->last_ib_barrier_buf->gpu_address + sctx->last_ib_barrier_buf_offset,
1203 1, 1, WAIT_REG_MEM_EQUAL);
1204 }
1205
1206 sctx->prim_discard_compute_ib_initialized = true;
1207 }
1208
1209 /* Allocate the output index buffer. */
1210 output_indexbuf_size = align(output_indexbuf_size, sctx->screen->info.tcc_cache_line_size);
1211 assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib);
1212 out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset;
1213 sctx->index_ring_offset += output_indexbuf_size;
1214
1215 radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE,
1216 RADEON_PRIO_SHADER_RW_BUFFER);
1217 uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset;
1218
1219 /* Prepare index buffer descriptors. */
1220 struct si_resource *indexbuf_desc = NULL;
1221 unsigned indexbuf_desc_offset;
1222 unsigned desc_size = 12 * 4;
1223 uint32_t *desc;
1224
1225 u_upload_alloc(sctx->b.const_uploader, 0, desc_size, si_optimal_tcc_alignment(sctx, desc_size),
1226 &indexbuf_desc_offset, (struct pipe_resource **)&indexbuf_desc, (void **)&desc);
1227 radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ,
1228 RADEON_PRIO_DESCRIPTORS);
1229
1230 /* Input index buffer. */
1231 desc[0] = input_indexbuf_va;
1232 desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | S_008F04_STRIDE(index_size);
1233 desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? index_size : 1);
1234
1235 if (sctx->chip_class >= GFX10) {
1236 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1237 S_008F0C_FORMAT(index_size == 1 ? V_008F0C_IMG_FORMAT_8_UINT
1238 : index_size == 2 ? V_008F0C_IMG_FORMAT_16_UINT
1239 : V_008F0C_IMG_FORMAT_32_UINT) |
1240 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
1241 S_008F0C_RESOURCE_LEVEL(1);
1242 } else {
1243 desc[3] =
1244 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
1245 S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8
1246 : index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16
1247 : V_008F0C_BUF_DATA_FORMAT_32);
1248 }
1249
1250 /* Output index buffer. */
1251 desc[4] = out_indexbuf_va;
1252 desc[5] =
1253 S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | S_008F04_STRIDE(vertices_per_prim * 4);
1254 desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1);
1255
1256 if (sctx->chip_class >= GFX10) {
1257 desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1258 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
1259 S_008F0C_FORMAT(gfx10_output_indexbuf_format) |
1260 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
1261 S_008F0C_RESOURCE_LEVEL(1);
1262 } else {
1263 desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1264 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
1265 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
1266 S_008F0C_DATA_FORMAT(output_indexbuf_format);
1267 }
1268
1269 /* Viewport state. */
1270 struct si_small_prim_cull_info cull_info;
1271 si_get_small_prim_cull_info(sctx, &cull_info);
1272
1273 desc[8] = fui(cull_info.scale[0]);
1274 desc[9] = fui(cull_info.scale[1]);
1275 desc[10] = fui(cull_info.translate[0]);
1276 desc[11] = fui(cull_info.translate[1]);
1277
1278 /* Better subpixel precision increases the efficiency of small
1279 * primitive culling. */
1280 unsigned num_samples = sctx->framebuffer.nr_samples;
1281 unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode;
1282 float small_prim_cull_precision;
1283
1284 if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
1285 small_prim_cull_precision = num_samples / 4096.0;
1286 else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
1287 small_prim_cull_precision = num_samples / 1024.0;
1288 else
1289 small_prim_cull_precision = num_samples / 256.0;
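
   /* Example (illustrative): at 4x MSAA with 1/256th quantization the value is
    * 4 / 256.0 = 0.015625, while 1/1024th quantization gives 4 / 1024.0; as
    * the comment above notes, finer subpixel precision makes small-primitive
    * culling more effective.
    */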
1290
1291 /* Set user data SGPRs. */
1292 /* This can't be greater than 14 if we want the fastest launch rate. */
1293 unsigned user_sgprs = 13;
1294
1295 uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset;
1296 unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX);
1297 unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX);
1298 uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address;
1299 uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address;
1300 uint64_t vb_desc_va = sctx->vb_descriptors_buffer
1301 ? sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset
1302 : 0;
1303 unsigned gds_offset, gds_size;
1304 struct si_fast_udiv_info32 num_prims_udiv = {};
1305
1306 if (info->instance_count > 1)
1307 num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31);
1308
1309 /* Limitations on how these two are packed in the user SGPR. */
1310 assert(num_prims_udiv.post_shift < 32);
1311 assert(num_prims_per_instance < 1 << 27);
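/* post_shift is packed into the low 5 bits and num_prims_per_instance into the
 * upper 27 bits of a single user SGPR; see the
 * post_shift | (num_prims_per_instance << 5) emit further below.
 */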
1312
1313 si_resource_reference(&indexbuf_desc, NULL);
1314
1315 bool primitive_restart = sctx->cs_prim_discard_state.current->key.opt.cs_primitive_restart;
1316
1317 if (VERTEX_COUNTER_GDS_MODE == 1) {
1318 gds_offset = sctx->compute_gds_offset;
1319 gds_size = primitive_restart ? 8 : 4;
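/* 4 bytes for the vertex counter, plus 4 more when primitive restart needs its
 * own GDS counter.
 */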
1320 sctx->compute_gds_offset += gds_size;
1321
1322 /* Reset the counters in GDS for the first dispatch using WRITE_DATA.
1323 * The remainder of the GDS will be cleared after the dispatch packet
1324 * in parallel with compute shaders.
1325 */
1326 if (first_dispatch) {
1327 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size / 4, 0));
1328 radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1));
1329 radeon_emit(cs, gds_offset);
1330 radeon_emit(cs, 0);
1331 radeon_emit(cs, 0); /* value to write */
1332 if (gds_size == 8)
1333 radeon_emit(cs, 0);
1334 }
1335 }
1336
1337 /* Set shader registers. */
1338 struct si_shader *shader = sctx->cs_prim_discard_state.current;
1339
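/* Re-emit the shader binary and compute registers only if a different culling
 * shader was last bound in this compute IB.
 */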
1340 if (shader != sctx->compute_ib_last_shader) {
1341 radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ,
1342 RADEON_PRIO_SHADER_BINARY);
1343 uint64_t shader_va = shader->bo->gpu_address;
1344
1345 assert(shader->config.scratch_bytes_per_wave == 0);
1346 assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4);
1347
1348 radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
1349 radeon_emit(cs, shader_va >> 8);
1350 radeon_emit(cs, S_00B834_DATA(shader_va >> 40));
1351
1352 radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
1353 radeon_emit(
1354 cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) |
1355 S_00B848_SGPRS(sctx->chip_class <= GFX9 ? (shader->config.num_sgprs - 1) / 8 : 0) |
1356 S_00B848_FLOAT_MODE(shader->config.float_mode) | S_00B848_DX10_CLAMP(1) |
1357 S_00B848_MEM_ORDERED(sctx->chip_class >= GFX10) |
1358 S_00B848_WGP_MODE(sctx->chip_class >= GFX10));
1359 radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) | S_00B84C_USER_SGPR(user_sgprs) |
1360 S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) |
1361 S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) |
1362 S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) |
1363 S_00B84C_LDS_SIZE(shader->config.lds_size));
1364
1365 radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
1366 ac_get_compute_resource_limits(&sctx->screen->info, WAVES_PER_TG,
1367 MAX_WAVES_PER_SH, THREADGROUPS_PER_CU));
1368 sctx->compute_ib_last_shader = shader;
1369 }
1370
1371 STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0);
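/* Sub-draws must start on a threadgroup boundary, so start_prim is always a
 * multiple of THREADGROUP_SIZE and only the final sub-draw of the whole draw
 * can end with a partial threadgroup.
 */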
1372
1373 /* Big draw calls are split into smaller dispatches and draw packets. */
1374 for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) {
1375 unsigned num_subdraw_prims;
1376
1377 if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims)
1378 num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL;
1379 else
1380 num_subdraw_prims = num_prims - start_prim;
1381
1382 /* Small dispatches are executed back to back until a specific primitive
1383 * count is reached. Then, a CS_DONE is inserted to signal the gfx IB
1384 * to start drawing the batch. This batching adds latency to the gfx IB,
1385 * but emitting CS_DONE and REWIND for every small dispatch would be too slow.
1386 */
1387 if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH)
1388 si_compute_signal_gfx(sctx);
1389
1390 if (sctx->compute_num_prims_in_batch == 0) {
1391 assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi);
1392 sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4;
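/* Address of the dword right after the NOP/REWIND header emitted below; the
 * compute side is expected to write REWIND_SIGNAL_BIT there (see
 * si_compute_signal_gfx) to let the gfx IB continue.
 */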
1393
1394 if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) {
1395 radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
1396 radeon_emit(gfx_cs, 0);
1397
1398 si_cp_wait_mem(
1399 sctx, gfx_cs,
1400 sctx->compute_rewind_va | (uint64_t)sctx->screen->info.address32_hi << 32,
1401 REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP);
1402
1403 /* Use INDIRECT_BUFFER to chain to a different buffer
1404 * to discard the CP prefetch cache.
1405 */
1406 sctx->ws->cs_check_space(gfx_cs, 0, true);
1407 } else {
1408 radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
1409 radeon_emit(gfx_cs, 0);
1410 }
1411 }
1412
1413 sctx->compute_num_prims_in_batch += num_subdraw_prims;
1414
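/* count_va points at the index-count dword of the DRAW_INDEX_2 packet emitted
 * below (4 dwords past its header), which the compute side overwrites with the
 * post-cull count. index_va assumes 3 indices * 4 bytes per output primitive.
 */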
1415 uint64_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4;
1416 uint64_t index_va = out_indexbuf_va + start_prim * 12;
1417
1418 /* Emit the draw packet into the gfx IB. */
1419 radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0));
1420 radeon_emit(gfx_cs, num_prims * vertices_per_prim);
1421 radeon_emit(gfx_cs, index_va);
1422 radeon_emit(gfx_cs, index_va >> 32);
1423 radeon_emit(gfx_cs, 0);
1424 radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA);
1425
1426 /* Continue with the compute IB. */
1427 if (start_prim == 0) {
1428 uint32_t gds_prim_restart_continue_bit = 0;
1429
1430 if (sctx->preserve_prim_restart_gds_at_flush) {
1431 assert(primitive_restart && info->mode == PIPE_PRIM_TRIANGLE_STRIP);
1432 assert(start_prim < 1 << 31);
1433 gds_prim_restart_continue_bit = 1 << 31;
1434 }
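/* The first sub-draw initializes all 13 user SGPRs; subsequent sub-draws only
 * refresh USER_DATA_1..3 (start value, last prim index, count address) in the
 * else branch below.
 */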
1435
1436 radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs);
1437 radeon_emit(cs, index_buffers_va);
1438 radeon_emit(cs, VERTEX_COUNTER_GDS_MODE == 0
1439 ? count_va
1440 : VERTEX_COUNTER_GDS_MODE == 1
1441 ? gds_offset
1442 : start_prim | gds_prim_restart_continue_bit);
1443 radeon_emit(cs, start_prim + num_subdraw_prims - 1);
1444 radeon_emit(cs, count_va);
1445 radeon_emit(cs, vb_desc_va);
1446 radeon_emit(cs, vs_const_desc_va);
1447 radeon_emit(cs, vs_sampler_desc_va);
1448 radeon_emit(cs, base_vertex);
1449 radeon_emit(cs, info->start_instance);
1450 radeon_emit(cs, num_prims_udiv.multiplier);
1451 radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5));
1452 radeon_emit(cs, info->restart_index);
1453 /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
1454 radeon_emit(cs, fui(small_prim_cull_precision));
1455 } else {
1456 assert(VERTEX_COUNTER_GDS_MODE == 2);
1457 /* Only update the SGPRs that changed. */
1458 radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3);
1459 radeon_emit(cs, start_prim);
1460 radeon_emit(cs, start_prim + num_subdraw_prims - 1);
1461 radeon_emit(cs, count_va);
1462 }
1463
1464 /* Set grid dimensions. */
1465 unsigned start_block = start_prim / THREADGROUP_SIZE;
1466 unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE;
1467 unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE;
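/* Primitives that don't fill a whole threadgroup run as a partial group; see
 * NUM_THREAD_PARTIAL and PARTIAL_TG_EN below.
 */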
1468
1469 radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block);
1470 radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X,
1471 S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) |
1472 S_00B81C_NUM_THREAD_PARTIAL(partial_block_size));
1473
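/* The X dimension is the end block index (it includes start_block), matching
 * COMPUTE_START_X above, so blockID.x stays the absolute threadgroup index
 * across sub-draws. Ordered append is only enabled for GDS mode 2.
 */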
1474 radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1));
1475 radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size);
1476 radeon_emit(cs, 1);
1477 radeon_emit(cs, 1);
1478 radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_PARTIAL_TG_EN(!!partial_block_size) |
1479 S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) |
1480 S_00B800_ORDER_MODE(0 /* launch in order */));
1481
1482 /* This release_mem is needed only for unordered append; with ordered
1483 * append, the shader writes the count itself.
1484 *
1485 * Note that EOP and EOS events are super slow, so emulating the event
1486 * in a shader is an important optimization.
1487 */
1488 if (VERTEX_COUNTER_GDS_MODE == 1) {
1489 si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0,
1490 sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
1491 EOP_INT_SEL_NONE, EOP_DATA_SEL_GDS, NULL,
1492 count_va | ((uint64_t)sctx->screen->info.address32_hi << 32),
1493 EOP_DATA_GDS(gds_offset / 4, 1), SI_NOT_QUERY);
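/* The CS_DONE event copies one dword from GDS at gds_offset into count_va,
 * i.e. directly into the index-count dword of the draw packet above.
 */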
1494
1495 /* Now that compute shaders are running, clear the remainder of GDS. */
1496 if (first_dispatch) {
1497 unsigned offset = gds_offset + gds_size;
1498 si_cp_dma_clear_buffer(
1499 sctx, cs, NULL, offset, GDS_SIZE_UNORDERED - offset, 0,
1500 SI_CPDMA_SKIP_CHECK_CS_SPACE | SI_CPDMA_SKIP_GFX_SYNC | SI_CPDMA_SKIP_SYNC_BEFORE,
1501 SI_COHERENCY_NONE, L2_BYPASS);
1502 }
1503 }
1504 first_dispatch = false;
1505
1506 assert(cs->current.cdw <= cs->current.max_dw);
1507 assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw);
1508 }
1509 }