- struct si_shader_key *key = &ctx->shader->key;
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef vs = ctx->main_fn;
-
- /* Always inline the VS function. */
- ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE);
- LLVMSetLinkage(vs, LLVMPrivateLinkage);
-
- enum ac_arg_type const_desc_type;
- if (ctx->shader->selector->info.const_buffers_declared == 1 &&
- ctx->shader->selector->info.shader_buffers_declared == 0)
- const_desc_type = AC_ARG_CONST_FLOAT_PTR;
- else
- const_desc_type = AC_ARG_CONST_DESC_PTR;
-
- memset(&ctx->args, 0, sizeof(ctx->args));
-
- struct ac_arg param_index_buffers_and_constants, param_vertex_counter;
- struct ac_arg param_vb_desc, param_const_desc;
- struct ac_arg param_base_vertex, param_start_instance;
- struct ac_arg param_block_id, param_local_id, param_ordered_wave_id;
- struct ac_arg param_restart_index, param_smallprim_precision;
- struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms;
- struct ac_arg param_sampler_desc, param_last_wave_prim_id, param_vertex_count_addr;
-
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
- ¶m_index_buffers_and_constants);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_vertex_counter);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_last_wave_prim_id);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_vertex_count_addr);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
- ¶m_vb_desc);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type,
- ¶m_const_desc);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
- ¶m_sampler_desc);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_base_vertex);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_start_instance);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_num_prims_udiv_multiplier);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_num_prims_udiv_terms);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_restart_index);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, ¶m_smallprim_precision);
-
- /* Block ID and thread ID inputs. */
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_block_id);
- if (VERTEX_COUNTER_GDS_MODE == 2)
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_ordered_wave_id);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, ¶m_local_id);
-
- /* Create the compute shader function. */
- unsigned old_type = ctx->type;
- ctx->type = PIPE_SHADER_COMPUTE;
- si_llvm_create_func(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE);
- ctx->type = old_type;
-
- if (VERTEX_COUNTER_GDS_MODE == 2) {
- ac_llvm_add_target_dep_function_attr(ctx->main_fn,
- "amdgpu-gds-size", 256);
- } else if (VERTEX_COUNTER_GDS_MODE == 1) {
- ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size",
- GDS_SIZE_UNORDERED);
- }
-
- /* Assemble parameters for VS. */
- LLVMValueRef vs_params[16];
- unsigned num_vs_params = 0;
- unsigned param_vertex_id, param_instance_id;
-
- vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */
- vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */
- vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc);
- vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc);
- vs_params[num_vs_params++] = LLVMConstInt(ctx->ac.i32,
- S_VS_STATE_INDEXED(key->opt.cs_indexed), 0);
- vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex);
- vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance);
- vs_params[num_vs_params++] = ctx->ac.i32_0; /* DrawID */
- vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc);
-
- vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */
- vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */
- vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused (PrimID) */
- vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused */
-
- assert(num_vs_params <= ARRAY_SIZE(vs_params));
- assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs))));
-
- /* Load descriptors. (load 8 dwords at once) */
- LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8];
-
- LLVMValueRef index_buffers_and_constants = ac_get_arg(&ctx->ac, param_index_buffers_and_constants);
- tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants,
- ac_array_in_const32_addr_space(ctx->ac.v8i32), "");
- tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->ac.i32_0);
-
- for (unsigned i = 0; i < 8; i++)
- desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i);
-
- input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4);
- output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4);
-
- /* Compute PrimID and InstanceID. */
- LLVMValueRef global_thread_id =
- ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id),
- LLVMConstInt(ctx->ac.i32, THREADGROUP_SIZE, 0),
- ac_get_arg(&ctx->ac, param_local_id));
- LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */
- LLVMValueRef instance_id = ctx->ac.i32_0;
-
- if (key->opt.cs_instancing) {
- LLVMValueRef num_prims_udiv_terms =
- ac_get_arg(&ctx->ac, param_num_prims_udiv_terms);
- LLVMValueRef num_prims_udiv_multiplier =
- ac_get_arg(&ctx->ac, param_num_prims_udiv_multiplier);
- /* Unpack num_prims_udiv_terms. */
- LLVMValueRef post_shift = LLVMBuildAnd(builder, num_prims_udiv_terms,
- LLVMConstInt(ctx->ac.i32, 0x1f, 0), "");
- LLVMValueRef prims_per_instance = LLVMBuildLShr(builder, num_prims_udiv_terms,
- LLVMConstInt(ctx->ac.i32, 5, 0), "");
- /* Divide the total prim_id by the number of prims per instance. */
- instance_id = ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id,
- num_prims_udiv_multiplier,
- post_shift);
- /* Compute the remainder. */
- prim_id = LLVMBuildSub(builder, prim_id,
- LLVMBuildMul(builder, instance_id,
- prims_per_instance, ""), "");
- }
-
- /* Generate indices (like a non-indexed draw call). */
- LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->ac.i32)};
- unsigned vertices_per_prim = 3;
-
- switch (key->opt.cs_prim_type) {
- case PIPE_PRIM_TRIANGLES:
- for (unsigned i = 0; i < 3; i++) {
- index[i] = ac_build_imad(&ctx->ac, prim_id,
- LLVMConstInt(ctx->ac.i32, 3, 0),
- LLVMConstInt(ctx->ac.i32, i, 0));
- }
- break;
- case PIPE_PRIM_TRIANGLE_STRIP:
- for (unsigned i = 0; i < 3; i++) {
- index[i] = LLVMBuildAdd(builder, prim_id,
- LLVMConstInt(ctx->ac.i32, i, 0), "");
- }
- break;
- case PIPE_PRIM_TRIANGLE_FAN:
- /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper
- * and rasterizer as a normal triangle, so we need to put the provoking
- * vertex into the correct index variable and preserve orientation at the same time.
- * gl_VertexID is preserved, because it's equal to the index.
- */
- if (key->opt.cs_provoking_vertex_first) {
- index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), "");
- index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), "");
- index[2] = ctx->ac.i32_0;
- } else {
- index[0] = ctx->ac.i32_0;
- index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), "");
- index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), "");
- }
- break;
- default:
- unreachable("unexpected primitive type");
- }
-
- /* Fetch indices. */
- if (key->opt.cs_indexed) {
- for (unsigned i = 0; i < 3; i++) {
- index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf,
- index[i], ctx->ac.i32_0, 1,
- 0, true);
- index[i] = ac_to_integer(&ctx->ac, index[i]);
- }
- }
-
- LLVMValueRef ordered_wave_id = NULL;
-
- /* Extract the ordered wave ID. */
- if (VERTEX_COUNTER_GDS_MODE == 2) {
- ordered_wave_id = ac_get_arg(&ctx->ac, param_ordered_wave_id);
- ordered_wave_id = LLVMBuildLShr(builder, ordered_wave_id,
- LLVMConstInt(ctx->ac.i32, 6, 0), "");
- ordered_wave_id = LLVMBuildAnd(builder, ordered_wave_id,
- LLVMConstInt(ctx->ac.i32, 0xfff, 0), "");
- }
- LLVMValueRef thread_id =
- LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id),
- LLVMConstInt(ctx->ac.i32, 63, 0), "");
-
- /* Every other triangle in a strip has a reversed vertex order, so we
- * need to swap vertices of odd primitives to get the correct primitive
- * orientation when converting triangle strips to triangles. Primitive
- * restart complicates it, because a strip can start anywhere.
- */
- LLVMValueRef prim_restart_accepted = ctx->ac.i1true;
- LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter);
-
- if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) {
- /* Without primitive restart, odd primitives have reversed orientation.
- * Only primitive restart can flip it with respect to the first vertex
- * of the draw call.
- */
- LLVMValueRef first_is_odd = ctx->ac.i1false;
-
- /* Handle primitive restart. */
- if (key->opt.cs_primitive_restart) {
- /* Get the GDS primitive restart continue flag and clear
- * the flag in vertex_counter. This flag is used when the draw
- * call was split and we need to load the primitive orientation
- * flag from GDS for the first wave too.
- */
- LLVMValueRef gds_prim_restart_continue =
- LLVMBuildLShr(builder, vertex_counter,
- LLVMConstInt(ctx->ac.i32, 31, 0), "");
- gds_prim_restart_continue =
- LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->ac.i1, "");
- vertex_counter = LLVMBuildAnd(builder, vertex_counter,
- LLVMConstInt(ctx->ac.i32, 0x7fffffff, 0), "");
-
- LLVMValueRef index0_is_reset;
-
- for (unsigned i = 0; i < 3; i++) {
- LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i],
- ac_get_arg(&ctx->ac, param_restart_index),
- "");
- if (i == 0)
- index0_is_reset = LLVMBuildNot(builder, not_reset, "");
- prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted,
- not_reset, "");
- }
-
- /* If the previous waves flip the primitive orientation
- * of the current triangle strip, it will be stored in GDS.
- *
- * Sometimes the correct orientation is not needed, in which case
- * we don't need to execute this.
- */
- if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) {
- /* If there are reset indices in this wave, get the thread index
- * where the most recent strip starts relative to each thread.
- */
- LLVMValueRef preceding_threads_mask =
- LLVMBuildSub(builder,
- LLVMBuildShl(builder, ctx->ac.i64_1,
- LLVMBuildZExt(builder, thread_id, ctx->ac.i64, ""), ""),
- ctx->ac.i64_1, "");
-
- LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset);
- LLVMValueRef preceding_reset_threadmask =
- LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, "");
- LLVMValueRef strip_start =
- ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL);
- strip_start = LLVMBuildAdd(builder, strip_start, ctx->ac.i32_1, "");
-
- /* This flips the orientatino based on reset indices within this wave only. */
- first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->ac.i1, "");
-
- LLVMValueRef last_strip_start, prev_wave_state, ret, tmp;
- LLVMValueRef is_first_wave, current_wave_resets_index;
-
- /* Get the thread index where the last strip starts in this wave.
- *
- * If the last strip doesn't start in this wave, the thread index
- * will be 0.
- *
- * If the last strip starts in the next wave, the thread index will
- * be 64.
- */
- last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL);
- last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->ac.i32_1, "");
-
- struct si_thread0_section section;
- si_enter_thread0_section(ctx, §ion, thread_id);
-
- /* This must be done in the thread 0 section, because
- * we expect PrimID to be 0 for the whole first wave
- * in this expression.
- *
- * NOTE: This will need to be different if we wanna support
- * instancing with primitive restart.
- */
- is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->ac.i32_0, "");
- is_first_wave = LLVMBuildAnd(builder, is_first_wave,
- LLVMBuildNot(builder,
- gds_prim_restart_continue, ""), "");
- current_wave_resets_index = LLVMBuildICmp(builder, LLVMIntNE,
- last_strip_start, ctx->ac.i32_0, "");
-
- ret = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "prev_state");
-
- /* Save the last strip start primitive index in GDS and read
- * the value that previous waves stored.
- *
- * if (is_first_wave || current_wave_resets_strip)
- * // Read the value that previous waves stored and store a new one.
- * first_is_odd = ds.ordered.swap(last_strip_start);
- * else
- * // Just read the value that previous waves stored.
- * first_is_odd = ds.ordered.add(0);
- */
- ac_build_ifcc(&ctx->ac,
- LLVMBuildOr(builder, is_first_wave,
- current_wave_resets_index, ""), 12602);
- {
- /* The GDS address is always 0 with ordered append. */
- tmp = si_build_ds_ordered_op(ctx, "swap",
- ordered_wave_id, last_strip_start,
- 1, true, false);
- LLVMBuildStore(builder, tmp, ret);
- }
- ac_build_else(&ctx->ac, 12603);
- {
- /* Just read the value from GDS. */
- tmp = si_build_ds_ordered_op(ctx, "add",
- ordered_wave_id, ctx->ac.i32_0,
- 1, true, false);
- LLVMBuildStore(builder, tmp, ret);
- }
- ac_build_endif(&ctx->ac, 12602);
-
- prev_wave_state = LLVMBuildLoad(builder, ret, "");
- /* Ignore the return value if this is the first wave. */
- prev_wave_state = LLVMBuildSelect(builder, is_first_wave,
- ctx->ac.i32_0, prev_wave_state, "");
- si_exit_thread0_section(§ion, &prev_wave_state);
- prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->ac.i1, "");
-
- /* If the strip start appears to be on thread 0 for the current primitive
- * (meaning the reset index is not present in this wave and might have
- * appeared in previous waves), use the value from GDS to determine
- * primitive orientation.
- *
- * If the strip start is in this wave for the current primitive, use
- * the value from the current wave to determine primitive orientation.
- */
- LLVMValueRef strip_start_is0 = LLVMBuildICmp(builder, LLVMIntEQ,
- strip_start, ctx->ac.i32_0, "");
- first_is_odd = LLVMBuildSelect(builder, strip_start_is0, prev_wave_state,
- first_is_odd, "");
- }
- }
- /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */
- LLVMValueRef prim_is_odd =
- LLVMBuildXor(builder, first_is_odd,
- LLVMBuildTrunc(builder, thread_id, ctx->ac.i1, ""), "");
-
- /* Convert triangle strip indices to triangle indices. */
- ac_build_triangle_strip_indices_to_triangle(&ctx->ac, prim_is_odd,
- LLVMConstInt(ctx->ac.i1, key->opt.cs_provoking_vertex_first, 0),
- index);
- }
-
- /* Execute the vertex shader for each vertex to get vertex positions. */
- LLVMValueRef pos[3][4];
- for (unsigned i = 0; i < vertices_per_prim; i++) {
- vs_params[param_vertex_id] = index[i];
- vs_params[param_instance_id] = instance_id;
-
- LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params);
- for (unsigned chan = 0; chan < 4; chan++)
- pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, "");
- }
-
- /* Divide XYZ by W. */
- for (unsigned i = 0; i < vertices_per_prim; i++) {
- for (unsigned chan = 0; chan < 3; chan++)
- pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]);
- }
-
- /* Load the viewport state. */
- LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants,
- LLVMConstInt(ctx->ac.i32, 2, 0));
- vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
- LLVMValueRef vp_scale[2], vp_translate[2];
- vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
- vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
- vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
- vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);
-
- /* Do culling. */
- struct ac_cull_options options = {};
- options.cull_front = key->opt.cs_cull_front;
- options.cull_back = key->opt.cs_cull_back;
- options.cull_view_xy = true;
- options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z;
- options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z;
- options.cull_small_prims = true;
- options.cull_zero_area = true;
- options.cull_w = true;
- options.use_halfz_clip_space = key->opt.cs_halfz_clip_space;
-
- LLVMValueRef accepted =
- ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted,
- vp_scale, vp_translate,
- ac_get_arg(&ctx->ac, param_smallprim_precision),
- &options);
-
- ac_build_optimization_barrier(&ctx->ac, &accepted);
- LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted);
-
- /* Count the number of active threads by doing bitcount(accepted). */
- LLVMValueRef num_prims_accepted =
- ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i64", ctx->ac.i64,
- &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE);
- num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->ac.i32, "");
-
- LLVMValueRef start;
-
- /* Execute atomic_add on the vertex count. */
- struct si_thread0_section section;
- si_enter_thread0_section(ctx, §ion, thread_id);
- {
- if (VERTEX_COUNTER_GDS_MODE == 0) {
- LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
- LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
- vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter);
- start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
- vertex_counter, num_indices,
- LLVMAtomicOrderingMonotonic, false);
- } else if (VERTEX_COUNTER_GDS_MODE == 1) {
- LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
- LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
- vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter,
- LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), "");
- start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
- vertex_counter, num_indices,
- LLVMAtomicOrderingMonotonic, false);
- } else if (VERTEX_COUNTER_GDS_MODE == 2) {
- LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
-
- /* If the draw call was split into multiple subdraws, each using
- * a separate draw packet, we need to start counting from 0 for
- * the first compute wave of the subdraw.
- *
- * vertex_counter contains the primitive ID of the first thread
- * in the first wave.
- *
- * This is only correct with VERTEX_COUNTER_GDS_MODE == 2:
- */
- LLVMValueRef is_first_wave =
- LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
- vertex_counter, "");
-
- /* Store the primitive count for ordered append, not vertex count.
- * The idea is to avoid GDS initialization via CP DMA. The shader
- * effectively stores the first count using "swap".
- *
- * if (first_wave) {
- * ds.ordered.swap(num_prims_accepted); // store the first primitive count
- * previous = 0;
- * } else {
- * previous = ds.ordered.add(num_prims_accepted) // add the primitive count
- * }
- */
- ac_build_ifcc(&ctx->ac, is_first_wave, 12604);
- {
- /* The GDS address is always 0 with ordered append. */
- si_build_ds_ordered_op(ctx, "swap", ordered_wave_id,
- num_prims_accepted, 0, true, true);
- LLVMBuildStore(builder, ctx->ac.i32_0, tmp_store);
- }
- ac_build_else(&ctx->ac, 12605);
- {
- LLVMBuildStore(builder,
- si_build_ds_ordered_op(ctx, "add", ordered_wave_id,
- num_prims_accepted, 0,
- true, true),
- tmp_store);
- }
- ac_build_endif(&ctx->ac, 12604);
-
- start = LLVMBuildLoad(builder, tmp_store, "");
- }
- }
- si_exit_thread0_section(§ion, &start);
-
- /* Write the final vertex count to memory. An EOS/EOP event could do this,
- * but those events are super slow and should be avoided if performance
- * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE
- * event like this.
- */
- if (VERTEX_COUNTER_GDS_MODE == 2) {
- ac_build_ifcc(&ctx->ac,
- LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
- ac_get_arg(&ctx->ac, param_last_wave_prim_id), ""),
- 12606);
- LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, "");
- count = LLVMBuildMul(builder, count,
- LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
-
- /* GFX8 needs to disable caching, so that the CP can see the stored value.
- * MTYPE=3 bypasses TC L2.
- */
- if (ctx->screen->info.chip_class <= GFX8) {
- LLVMValueRef desc[] = {
- ac_get_arg(&ctx->ac, param_vertex_count_addr),
- LLVMConstInt(ctx->ac.i32,
- S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0),
- LLVMConstInt(ctx->ac.i32, 4, 0),
- LLVMConstInt(ctx->ac.i32, S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
- S_008F0C_MTYPE(3 /* uncached */), 0),
- };
- LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4);
- ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->ac.i32_0,
- ctx->ac.i32_0, 0, ac_glc | ac_slc);
- } else {
- LLVMBuildStore(builder, count,
- si_expand_32bit_pointer(ctx,
- ac_get_arg(&ctx->ac,
- param_vertex_count_addr)));
- }
- ac_build_endif(&ctx->ac, 12606);
- } else {
- /* For unordered modes that increment a vertex count instead of
- * primitive count, convert it into the primitive index.
- */
- start = LLVMBuildUDiv(builder, start,
- LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
- }
-
- /* Now we need to store the indices of accepted primitives into
- * the output index buffer.
- */
- ac_build_ifcc(&ctx->ac, accepted, 16607);
- {
- /* Get the number of bits set before the index of this thread. */
- LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask);
-
- /* We have lowered instancing. Pack the instance ID into vertex ID. */
- if (key->opt.cs_instancing) {
- instance_id = LLVMBuildShl(builder, instance_id,
- LLVMConstInt(ctx->ac.i32, 16, 0), "");
-
- for (unsigned i = 0; i < vertices_per_prim; i++)
- index[i] = LLVMBuildOr(builder, index[i], instance_id, "");
- }
-
- if (VERTEX_COUNTER_GDS_MODE == 2) {
- /* vertex_counter contains the first primitive ID
- * for this dispatch. If the draw call was split into
- * multiple subdraws, the first primitive ID is > 0
- * for subsequent subdraws. Each subdraw uses a different
- * portion of the output index buffer. Offset the store
- * vindex by the first primitive ID to get the correct
- * store address for the subdraw.
- */
- start = LLVMBuildAdd(builder, start, vertex_counter, "");
- }
-
- /* Write indices for accepted primitives. */
- LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, "");
- LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3);
-
- if (!ac_has_vec3_support(ctx->ac.chip_class, true))
- vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3);
-
- ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata,
- vindex, ctx->ac.i32_0, 3,
- ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0));
- }
- ac_build_endif(&ctx->ac, 16607);
-
- LLVMBuildRetVoid(builder);
+ struct si_shader_key *key = &ctx->shader->key;
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef vs = ctx->main_fn;
+
+ /* Always inline the VS function. */
+ ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE);
+ LLVMSetLinkage(vs, LLVMPrivateLinkage);
+
+ enum ac_arg_type const_desc_type;
+ if (ctx->shader->selector->info.const_buffers_declared == 1 &&
+ ctx->shader->selector->info.shader_buffers_declared == 0)
+ const_desc_type = AC_ARG_CONST_FLOAT_PTR;
+ else
+ const_desc_type = AC_ARG_CONST_DESC_PTR;
+
+ memset(&ctx->args, 0, sizeof(ctx->args));
+
+ struct ac_arg param_index_buffers_and_constants, param_vertex_counter;
+ struct ac_arg param_vb_desc, param_const_desc;
+ struct ac_arg param_base_vertex, param_start_instance;
+ struct ac_arg param_block_id, param_local_id, param_ordered_wave_id;
+ struct ac_arg param_restart_index, param_smallprim_precision;
+ struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms;
+ struct ac_arg param_sampler_desc, param_last_wave_prim_id, param_vertex_count_addr;
+
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
+ ¶m_index_buffers_and_constants);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_vertex_counter);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_last_wave_prim_id);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_vertex_count_addr);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, ¶m_vb_desc);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type, ¶m_const_desc);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, ¶m_sampler_desc);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_base_vertex);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_start_instance);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_num_prims_udiv_multiplier);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_num_prims_udiv_terms);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_restart_index);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, ¶m_smallprim_precision);
+
+ /* Block ID and thread ID inputs. */
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_block_id);
+ if (VERTEX_COUNTER_GDS_MODE == 2)
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_ordered_wave_id);
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, ¶m_local_id);
+
+ /* Create the compute shader function. */
+ unsigned old_type = ctx->type;
+ ctx->type = PIPE_SHADER_COMPUTE;
+ si_llvm_create_func(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE);
+ ctx->type = old_type;
+
+ if (VERTEX_COUNTER_GDS_MODE == 2) {
+ ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256);
+ } else if (VERTEX_COUNTER_GDS_MODE == 1) {
+ ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", GDS_SIZE_UNORDERED);
+ }
+
+ /* Assemble parameters for VS. */
+ LLVMValueRef vs_params[16];
+ unsigned num_vs_params = 0;
+ unsigned param_vertex_id, param_instance_id;
+
+ vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */
+ vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */
+ vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc);
+ vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc);
+ vs_params[num_vs_params++] =
+ LLVMConstInt(ctx->ac.i32, S_VS_STATE_INDEXED(key->opt.cs_indexed), 0);
+ vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex);
+ vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance);
+ vs_params[num_vs_params++] = ctx->ac.i32_0; /* DrawID */
+ vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc);
+
+ vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */
+ vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */
+ vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused (PrimID) */
+ vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused */
+
+ assert(num_vs_params <= ARRAY_SIZE(vs_params));
+ assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs))));
+
+ /* Load descriptors. (load 8 dwords at once) */
+ LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8];
+
+ LLVMValueRef index_buffers_and_constants =
+ ac_get_arg(&ctx->ac, param_index_buffers_and_constants);
+ tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants,
+ ac_array_in_const32_addr_space(ctx->ac.v8i32), "");
+ tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->ac.i32_0);
+
+ for (unsigned i = 0; i < 8; i++)
+ desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i);
+
+ input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4);
+ output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4);
+
+ /* Compute PrimID and InstanceID. */
+ LLVMValueRef global_thread_id = ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id),
+ LLVMConstInt(ctx->ac.i32, THREADGROUP_SIZE, 0),
+ ac_get_arg(&ctx->ac, param_local_id));
+ LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */
+ LLVMValueRef instance_id = ctx->ac.i32_0;
+
+ if (key->opt.cs_instancing) {
+ LLVMValueRef num_prims_udiv_terms = ac_get_arg(&ctx->ac, param_num_prims_udiv_terms);
+ LLVMValueRef num_prims_udiv_multiplier =
+ ac_get_arg(&ctx->ac, param_num_prims_udiv_multiplier);
+ /* Unpack num_prims_udiv_terms. */
+ LLVMValueRef post_shift =
+ LLVMBuildAnd(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 0x1f, 0), "");
+ LLVMValueRef prims_per_instance =
+ LLVMBuildLShr(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 5, 0), "");
+ /* Divide the total prim_id by the number of prims per instance. */
+ instance_id =
+ ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id, num_prims_udiv_multiplier, post_shift);
+ /* Compute the remainder. */
+ prim_id = LLVMBuildSub(builder, prim_id,
+ LLVMBuildMul(builder, instance_id, prims_per_instance, ""), "");
+ }
+
+ /* Generate indices (like a non-indexed draw call). */
+ LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->ac.i32)};
+ unsigned vertices_per_prim = 3;
+
+ switch (key->opt.cs_prim_type) {
+ case PIPE_PRIM_TRIANGLES:
+ for (unsigned i = 0; i < 3; i++) {
+ index[i] = ac_build_imad(&ctx->ac, prim_id, LLVMConstInt(ctx->ac.i32, 3, 0),
+ LLVMConstInt(ctx->ac.i32, i, 0));
+ }
+ break;
+ case PIPE_PRIM_TRIANGLE_STRIP:
+ for (unsigned i = 0; i < 3; i++) {
+ index[i] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, i, 0), "");
+ }
+ break;
+ case PIPE_PRIM_TRIANGLE_FAN:
+ /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper
+ * and rasterizer as a normal triangle, so we need to put the provoking
+ * vertex into the correct index variable and preserve orientation at the same time.
+ * gl_VertexID is preserved, because it's equal to the index.
+ */
+ if (key->opt.cs_provoking_vertex_first) {
+ index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), "");
+ index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), "");
+ index[2] = ctx->ac.i32_0;
+ } else {
+ index[0] = ctx->ac.i32_0;
+ index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), "");
+ index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), "");
+ }
+ break;
+ default:
+ unreachable("unexpected primitive type");
+ }
+
+ /* Fetch indices. */
+ if (key->opt.cs_indexed) {
+ for (unsigned i = 0; i < 3; i++) {
+ index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, index[i], ctx->ac.i32_0,
+ 1, 0, true);
+ index[i] = ac_to_integer(&ctx->ac, index[i]);
+ }
+ }
+
+ LLVMValueRef ordered_wave_id = NULL;
+
+ /* Extract the ordered wave ID. */
+ if (VERTEX_COUNTER_GDS_MODE == 2) {
+ ordered_wave_id = ac_get_arg(&ctx->ac, param_ordered_wave_id);
+ ordered_wave_id =
+ LLVMBuildLShr(builder, ordered_wave_id, LLVMConstInt(ctx->ac.i32, 6, 0), "");
+ ordered_wave_id =
+ LLVMBuildAnd(builder, ordered_wave_id, LLVMConstInt(ctx->ac.i32, 0xfff, 0), "");
+ }
+ LLVMValueRef thread_id = LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id),
+ LLVMConstInt(ctx->ac.i32, 63, 0), "");
+
+ /* Every other triangle in a strip has a reversed vertex order, so we
+ * need to swap vertices of odd primitives to get the correct primitive
+ * orientation when converting triangle strips to triangles. Primitive
+ * restart complicates it, because a strip can start anywhere.
+ */
+ LLVMValueRef prim_restart_accepted = ctx->ac.i1true;
+ LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter);
+
+ if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) {
+ /* Without primitive restart, odd primitives have reversed orientation.
+ * Only primitive restart can flip it with respect to the first vertex
+ * of the draw call.
+ */
+ LLVMValueRef first_is_odd = ctx->ac.i1false;
+
+ /* Handle primitive restart. */
+ if (key->opt.cs_primitive_restart) {
+ /* Get the GDS primitive restart continue flag and clear
+ * the flag in vertex_counter. This flag is used when the draw
+ * call was split and we need to load the primitive orientation
+ * flag from GDS for the first wave too.
+ */
+ LLVMValueRef gds_prim_restart_continue =
+ LLVMBuildLShr(builder, vertex_counter, LLVMConstInt(ctx->ac.i32, 31, 0), "");
+ gds_prim_restart_continue =
+ LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->ac.i1, "");
+ vertex_counter =
+ LLVMBuildAnd(builder, vertex_counter, LLVMConstInt(ctx->ac.i32, 0x7fffffff, 0), "");
+
+ LLVMValueRef index0_is_reset;
+
+ for (unsigned i = 0; i < 3; i++) {
+ LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i],
+ ac_get_arg(&ctx->ac, param_restart_index), "");
+ if (i == 0)
+ index0_is_reset = LLVMBuildNot(builder, not_reset, "");
+ prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted, not_reset, "");
+ }
+
+ /* If the previous waves flip the primitive orientation
+ * of the current triangle strip, it will be stored in GDS.
+ *
+ * Sometimes the correct orientation is not needed, in which case
+ * we don't need to execute this.
+ */
+ if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) {
+ /* If there are reset indices in this wave, get the thread index
+ * where the most recent strip starts relative to each thread.
+ */
+ LLVMValueRef preceding_threads_mask =
+ LLVMBuildSub(builder,
+ LLVMBuildShl(builder, ctx->ac.i64_1,
+ LLVMBuildZExt(builder, thread_id, ctx->ac.i64, ""), ""),
+ ctx->ac.i64_1, "");
+
+ LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset);
+ LLVMValueRef preceding_reset_threadmask =
+ LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, "");
+ LLVMValueRef strip_start = ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL);
+ strip_start = LLVMBuildAdd(builder, strip_start, ctx->ac.i32_1, "");
+
+ /* This flips the orientation based on reset indices within this wave only. */
+ first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->ac.i1, "");
+
+ LLVMValueRef last_strip_start, prev_wave_state, ret, tmp;
+ LLVMValueRef is_first_wave, current_wave_resets_index;
+
+ /* Get the thread index where the last strip starts in this wave.
+ *
+ * If the last strip doesn't start in this wave, the thread index
+ * will be 0.
+ *
+ * If the last strip starts in the next wave, the thread index will
+ * be 64.
+ */
+ last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL);
+ last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->ac.i32_1, "");
+
+ struct si_thread0_section section;
+ si_enter_thread0_section(ctx, §ion, thread_id);
+
+ /* This must be done in the thread 0 section, because
+ * we expect PrimID to be 0 for the whole first wave
+ * in this expression.
+ *
+ * NOTE: This will need to be different if we wanna support
+ * instancing with primitive restart.
+ */
+ is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->ac.i32_0, "");
+ is_first_wave = LLVMBuildAnd(builder, is_first_wave,
+ LLVMBuildNot(builder, gds_prim_restart_continue, ""), "");
+ current_wave_resets_index =
+ LLVMBuildICmp(builder, LLVMIntNE, last_strip_start, ctx->ac.i32_0, "");
+
+ ret = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "prev_state");
+
+ /* Save the last strip start primitive index in GDS and read
+ * the value that previous waves stored.
+ *
+ * if (is_first_wave || current_wave_resets_index)
+ * // Read the value that previous waves stored and store a new one.
+ * first_is_odd = ds.ordered.swap(last_strip_start);
+ * else
+ * // Just read the value that previous waves stored.
+ * first_is_odd = ds.ordered.add(0);
+ */
+ ac_build_ifcc(
+ &ctx->ac, LLVMBuildOr(builder, is_first_wave, current_wave_resets_index, ""), 12602);
+ {
+ /* The GDS address is always 0 with ordered append. */
+ tmp = si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, last_strip_start, 1, true,
+ false);
+ LLVMBuildStore(builder, tmp, ret);
+ }
+ ac_build_else(&ctx->ac, 12603);
+ {
+ /* Just read the value from GDS. */
+ tmp = si_build_ds_ordered_op(ctx, "add", ordered_wave_id, ctx->ac.i32_0, 1, true,
+ false);
+ LLVMBuildStore(builder, tmp, ret);
+ }
+ ac_build_endif(&ctx->ac, 12602);
+
+ prev_wave_state = LLVMBuildLoad(builder, ret, "");
+ /* Ignore the return value if this is the first wave. */
+ prev_wave_state =
+ LLVMBuildSelect(builder, is_first_wave, ctx->ac.i32_0, prev_wave_state, "");
+ si_exit_thread0_section(§ion, &prev_wave_state);
+ prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->ac.i1, "");
+
+ /* If the strip start appears to be on thread 0 for the current primitive
+ * (meaning the reset index is not present in this wave and might have
+ * appeared in previous waves), use the value from GDS to determine
+ * primitive orientation.
+ *
+ * If the strip start is in this wave for the current primitive, use
+ * the value from the current wave to determine primitive orientation.
+ */
+ LLVMValueRef strip_start_is0 =
+ LLVMBuildICmp(builder, LLVMIntEQ, strip_start, ctx->ac.i32_0, "");
+ first_is_odd =
+ LLVMBuildSelect(builder, strip_start_is0, prev_wave_state, first_is_odd, "");
+ }
+ }
+ /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */
+ LLVMValueRef prim_is_odd = LLVMBuildXor(
+ builder, first_is_odd, LLVMBuildTrunc(builder, thread_id, ctx->ac.i1, ""), "");
+
+ /* Convert triangle strip indices to triangle indices. */
+ ac_build_triangle_strip_indices_to_triangle(
+ &ctx->ac, prim_is_odd, LLVMConstInt(ctx->ac.i1, key->opt.cs_provoking_vertex_first, 0),
+ index);
+ }
+
+ /* Execute the vertex shader for each vertex to get vertex positions. */
+ LLVMValueRef pos[3][4];
+ for (unsigned i = 0; i < vertices_per_prim; i++) {
+ vs_params[param_vertex_id] = index[i];
+ vs_params[param_instance_id] = instance_id;
+
+ LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params);
+ for (unsigned chan = 0; chan < 4; chan++)
+ pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, "");
+ }
+
+ /* Divide XYZ by W. */
+ for (unsigned i = 0; i < vertices_per_prim; i++) {
+ for (unsigned chan = 0; chan < 3; chan++)
+ pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]);
+ }
+
+ /* Load the viewport state. */
+ LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants,
+ LLVMConstInt(ctx->ac.i32, 2, 0));
+ vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
+ LLVMValueRef vp_scale[2], vp_translate[2];
+ vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
+ vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
+ vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
+ vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);
+
+ /* Do culling. */
+ struct ac_cull_options options = {};
+ options.cull_front = key->opt.cs_cull_front;
+ options.cull_back = key->opt.cs_cull_back;
+ options.cull_view_xy = true;
+ options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z;
+ options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z;
+ options.cull_small_prims = true;
+ options.cull_zero_area = true;
+ options.cull_w = true;
+ options.use_halfz_clip_space = key->opt.cs_halfz_clip_space;
+
+ LLVMValueRef accepted =
+ ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, vp_scale, vp_translate,
+ ac_get_arg(&ctx->ac, param_smallprim_precision), &options);
+
+ ac_build_optimization_barrier(&ctx->ac, &accepted);
+ LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted);
+
+ /* Count the number of active threads by doing bitcount(accepted). */
+ LLVMValueRef num_prims_accepted = ac_build_intrinsic(
+ &ctx->ac, "llvm.ctpop.i64", ctx->ac.i64, &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE);
+ num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->ac.i32, "");
+
+ LLVMValueRef start;
+
+ /* Execute atomic_add on the vertex count. */
+ struct si_thread0_section section;
+ si_enter_thread0_section(ctx, §ion, thread_id);
+ {
+ if (VERTEX_COUNTER_GDS_MODE == 0) {
+ LLVMValueRef num_indices = LLVMBuildMul(
+ builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
+ vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter);
+ start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices,
+ LLVMAtomicOrderingMonotonic, false);
+ } else if (VERTEX_COUNTER_GDS_MODE == 1) {
+ LLVMValueRef num_indices = LLVMBuildMul(
+ builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
+ vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter,
+ LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), "");
+ start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices,
+ LLVMAtomicOrderingMonotonic, false);
+ } else if (VERTEX_COUNTER_GDS_MODE == 2) {
+ LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
+
+ /* If the draw call was split into multiple subdraws, each using
+ * a separate draw packet, we need to start counting from 0 for
+ * the first compute wave of the subdraw.
+ *
+ * vertex_counter contains the primitive ID of the first thread
+ * in the first wave.
+ *
+ * This is only correct with VERTEX_COUNTER_GDS_MODE == 2:
+ */
+ LLVMValueRef is_first_wave =
+ LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id, vertex_counter, "");
+
+ /* Store the primitive count for ordered append, not vertex count.
+ * The idea is to avoid GDS initialization via CP DMA. The shader
+ * effectively stores the first count using "swap".
+ *
+ * if (first_wave) {
+ * ds.ordered.swap(num_prims_accepted); // store the first primitive count
+ * previous = 0;
+ * } else {
+ * previous = ds.ordered.add(num_prims_accepted) // add the primitive count
+ * }
+ */
+ ac_build_ifcc(&ctx->ac, is_first_wave, 12604);
+ {
+ /* The GDS address is always 0 with ordered append. */
+ si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, num_prims_accepted, 0, true, true);
+ LLVMBuildStore(builder, ctx->ac.i32_0, tmp_store);
+ }
+ ac_build_else(&ctx->ac, 12605);
+ {
+ LLVMBuildStore(builder,
+ si_build_ds_ordered_op(ctx, "add", ordered_wave_id, num_prims_accepted,
+ 0, true, true),
+ tmp_store);
+ }
+ ac_build_endif(&ctx->ac, 12604);
+
+ start = LLVMBuildLoad(builder, tmp_store, "");
+ }
+ }
+ si_exit_thread0_section(§ion, &start);
+
+ /* Write the final vertex count to memory. An EOS/EOP event could do this,
+ * but those events are super slow and should be avoided if performance
+ * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE
+ * event like this.
+ */
+ if (VERTEX_COUNTER_GDS_MODE == 2) {
+ ac_build_ifcc(&ctx->ac,
+ LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
+ ac_get_arg(&ctx->ac, param_last_wave_prim_id), ""),
+ 12606);
+ LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, "");
+ count = LLVMBuildMul(builder, count, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
+
+ /* GFX8 needs to disable caching, so that the CP can see the stored value.
+ * MTYPE=3 bypasses TC L2.
+ */
+ if (ctx->screen->info.chip_class <= GFX8) {
+ LLVMValueRef desc[] = {
+ ac_get_arg(&ctx->ac, param_vertex_count_addr),
+ LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0),
+ LLVMConstInt(ctx->ac.i32, 4, 0),
+ LLVMConstInt(
+ ctx->ac.i32,
+ S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | S_008F0C_MTYPE(3 /* uncached */),
+ 0),
+ };
+ LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4);
+ ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->ac.i32_0, ctx->ac.i32_0, 0,
+ ac_glc | ac_slc);
+ } else {
+ LLVMBuildStore(
+ builder, count,
+ si_expand_32bit_pointer(ctx, ac_get_arg(&ctx->ac, param_vertex_count_addr)));
+ }
+ ac_build_endif(&ctx->ac, 12606);
+ } else {
+ /* For unordered modes that increment a vertex count instead of
+ * primitive count, convert it into the primitive index.
+ */
+ start = LLVMBuildUDiv(builder, start, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
+ }
+
+ /* Now we need to store the indices of accepted primitives into
+ * the output index buffer.
+ */
+ ac_build_ifcc(&ctx->ac, accepted, 16607);
+ {
+ /* Get the number of bits set before the index of this thread. */
+ LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask);
+
+ /* We have lowered instancing. Pack the instance ID into vertex ID. */
+ if (key->opt.cs_instancing) {
+ instance_id = LLVMBuildShl(builder, instance_id, LLVMConstInt(ctx->ac.i32, 16, 0), "");
+
+ for (unsigned i = 0; i < vertices_per_prim; i++)
+ index[i] = LLVMBuildOr(builder, index[i], instance_id, "");
+ }
+
+ if (VERTEX_COUNTER_GDS_MODE == 2) {
+ /* vertex_counter contains the first primitive ID
+ * for this dispatch. If the draw call was split into
+ * multiple subdraws, the first primitive ID is > 0
+ * for subsequent subdraws. Each subdraw uses a different
+ * portion of the output index buffer. Offset the store
+ * vindex by the first primitive ID to get the correct
+ * store address for the subdraw.
+ */
+ start = LLVMBuildAdd(builder, start, vertex_counter, "");
+ }
+
+ /* Write indices for accepted primitives. */
+ LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, "");
+ LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3);
+
+ if (!ac_has_vec3_support(ctx->ac.chip_class, true))
+ vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3);
+
+ ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, vindex, ctx->ac.i32_0, 3,
+ ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0));
+ }
+ ac_build_endif(&ctx->ac, 16607);
+
+ LLVMBuildRetVoid(builder);