From c9b7a37b8f7979433655e269a2b161d33eb41659 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 14 Aug 2018 02:01:18 -0400 Subject: [PATCH] radeonsi: cull primitives with async compute for large draw calls MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Tested-by: Dieter Nützel Acked-by: Nicolai Hähnle --- src/gallium/drivers/radeonsi/Makefile.sources | 1 + src/gallium/drivers/radeonsi/meson.build | 1 + .../radeonsi/si_compute_prim_discard.c | 1567 +++++++++++++++++ src/gallium/drivers/radeonsi/si_cp_dma.c | 8 +- src/gallium/drivers/radeonsi/si_debug.c | 32 +- src/gallium/drivers/radeonsi/si_fence.c | 8 +- src/gallium/drivers/radeonsi/si_gfx_cs.c | 80 + src/gallium/drivers/radeonsi/si_pipe.c | 12 +- src/gallium/drivers/radeonsi/si_pipe.h | 63 +- src/gallium/drivers/radeonsi/si_query.c | 6 + src/gallium/drivers/radeonsi/si_shader.c | 63 + src/gallium/drivers/radeonsi/si_shader.h | 14 + src/gallium/drivers/radeonsi/si_state.c | 9 + src/gallium/drivers/radeonsi/si_state.h | 5 + src/gallium/drivers/radeonsi/si_state_draw.c | 252 ++- src/gallium/drivers/radeonsi/si_state_msaa.c | 4 + .../drivers/radeonsi/si_state_shaders.c | 21 +- .../drivers/radeonsi/si_state_viewport.c | 6 + 18 files changed, 2124 insertions(+), 28 deletions(-) create mode 100644 src/gallium/drivers/radeonsi/si_compute_prim_discard.c diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources index 713629c6e87..62747f57b87 100644 --- a/src/gallium/drivers/radeonsi/Makefile.sources +++ b/src/gallium/drivers/radeonsi/Makefile.sources @@ -10,6 +10,7 @@ C_SOURCES := \ si_build_pm4.h \ si_clear.c \ si_compute.c \ + si_compute_prim_discard.c \ si_compute.h \ si_compute_blit.c \ si_cp_dma.c \ diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build index cf3b24cd358..ae216bc1858 100644 --- a/src/gallium/drivers/radeonsi/meson.build +++ 
b/src/gallium/drivers/radeonsi/meson.build @@ -26,6 +26,7 @@ files_libradeonsi = files( 'si_build_pm4.h', 'si_clear.c', 'si_compute.c', + 'si_compute_prim_discard.c', 'si_compute.h', 'si_compute_blit.c', 'si_cp_dma.c', diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c new file mode 100644 index 00000000000..71253c50092 --- /dev/null +++ b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c @@ -0,0 +1,1567 @@ +/* + * Copyright 2019 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include "si_pipe.h" +#include "si_shader_internal.h" +#include "sid.h" +#include "si_build_pm4.h" +#include "ac_llvm_cull.h" + +#include "util/u_prim.h" +#include "util/u_suballoc.h" +#include "util/u_upload_mgr.h" +#include "util/fast_idiv_by_const.h" + +/* Based on: + * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf + */ + +/* This file implements primitive culling using asynchronous compute. + * It's written to be GL conformant. + * + * It takes a monolithic VS in LLVM IR returning gl_Position and invokes it + * in a compute shader. The shader processes 1 primitive/thread by invoking + * the VS for each vertex to get the positions, decomposes strips and fans + * into triangles (if needed), eliminates primitive restart (if needed), + * does (W<0) culling, face culling, view XY culling, zero-area and + * small-primitive culling, and generates a new index buffer that doesn't + * contain culled primitives. + * + * The index buffer is generated using the Ordered Count feature of GDS, + * which is an atomic counter that is incremented in the wavefront launch + * order, so that the original primitive order is preserved. + * + * Another GDS ordered counter is used to eliminate primitive restart indices. + * If a restart index lands on an even thread ID, the compute shader has to flip + * the primitive orientation of the whole following triangle strip. The primitive + * orientation has to be correct after strip and fan decomposition for two-sided + * shading to behave correctly. The decomposition also needs to be aware of + * which vertex is the provoking vertex for flat shading to behave correctly. + * + * IB = a GPU command buffer + * + * Both the compute and gfx IBs run in parallel sort of like CE and DE. + * The gfx IB has a CP barrier (REWIND packet) before a draw packet. REWIND + * doesn't continue if its word isn't 0x80000000. 
Once compute shaders are + * finished culling, the last wave will write the final primitive count from + * GDS directly into the count word of the draw packet in the gfx IB, and + * a CS_DONE event will signal the REWIND packet to continue. It's really + * a direct draw with command buffer patching from the compute queue. + * + * The compute IB doesn't have to start when its corresponding gfx IB starts, + * but can start sooner. The compute IB is signaled to start after the last + * execution barrier in the *previous* gfx IB. This is handled as follows. + * The kernel GPU scheduler starts the compute IB after the previous gfx IB has + * started. The compute IB then waits (WAIT_REG_MEM) for a mid-IB fence that + * represents the barrier in the previous gfx IB. + * + * Features: + * - Triangle strips and fans are decomposed into an indexed triangle list. + * The decomposition differs based on the provoking vertex state. + * - Instanced draws are converted into non-instanced draws for 16-bit indices. + * (InstanceID is stored in the high bits of VertexID and unpacked by VS) + * - Primitive restart is fully supported with triangle strips, including + * correct primitive orientation across multiple waves. (restart indices + * reset primitive orientation) + * - W<0 culling (W<0 is behind the viewer, sort of like near Z culling). + * - Back face culling, incl. culling zero-area / degenerate primitives. + * - View XY culling. + * - View Z culling (disabled due to limited impact with perspective projection). + * - Small primitive culling for all MSAA modes and all quant modes. + * + * The following are not implemented: + * - ClipVertex/ClipDistance/CullDistance-based culling. + * - Scissor culling. + * - HiZ culling. + * + * Limitations (and unimplemented features that may be possible to implement): + * - Only triangles, triangle strips, and triangle fans are supported. + * - Primitive restart is only supported with triangle strips. 
+ * - Instancing and primitive restart can't be used together. + * - Instancing is only supported with 16-bit indices and instance count <= 2^16. + * - The instance divisor buffer is unavailable, so all divisors must be + * either 0 or 1. + * - Multidraws where the vertex shader reads gl_DrawID are unsupported. + * - No support for tessellation and geometry shaders. + * (patch elimination where tess factors are 0 would be possible to implement) + * - The vertex shader must not contain memory stores. + * - All VS resources must not have a write usage in the command buffer. + * (TODO: all shader buffers currently set the write usage) + * - Bindless textures and images must not occur in the vertex shader. + * + * User data SGPR layout: + * INDEX_BUFFERS: pointer to constants + * 0..3: input index buffer - typed buffer view + * 4..7: output index buffer - typed buffer view + * 8..11: viewport state - scale.xy, translate.xy + * VERTEX_COUNTER: counter address or first primitive ID + * - If unordered memory counter: address of "count" in the draw packet + * and is incremented atomically by the shader. + * - If unordered GDS counter: address of "count" in GDS starting from 0, + * must be initialized to 0 before the dispatch. + * - If ordered GDS counter: the primitive ID that should reset the vertex + * counter to 0 in GDS + * LAST_WAVE_PRIM_ID: the primitive ID that should write the final vertex + * count to memory if using GDS ordered append + * VERTEX_COUNT_ADDR: where the last wave should write the vertex count if + * using GDS ordered append + * VS.VERTEX_BUFFERS: same value as VS + * VS.CONST_AND_SHADER_BUFFERS: same value as VS + * VS.SAMPLERS_AND_IMAGES: same value as VS + * VS.BASE_VERTEX: same value as VS + * VS.START_INSTANCE: same value as VS + * NUM_PRIMS_UDIV_MULTIPLIER: For fast 31-bit division by the number of primitives + * per instance for instancing. + * NUM_PRIMS_UDIV_TERMS: + * - Bits [0:4]: "post_shift" for fast 31-bit division for instancing. 
+ * - Bits [5:31]: The number of primitives per instance for computing the remainder. + * PRIMITIVE_RESTART_INDEX + * SMALL_PRIM_CULLING_PRECISION: Scale the primitive bounding box by this number. + * + * + * The code contains 3 codepaths: + * - Unordered memory counter (for debugging, random primitive order, no primitive restart) + * - Unordered GDS counter (for debugging, random primitive order, no primitive restart) + * - Ordered GDS counter (it preserves the primitive order) + * + * How to test primitive restart (the most complicated part because it needs + * to get the primitive orientation right): + * Set THREADGROUP_SIZE to 2 to exercise both intra-wave and inter-wave + * primitive orientation flips with small draw calls, which is what most tests use. + * You can also enable draw call splitting into draw calls with just 2 primitives. + */ + +/* At least 256 is needed for the fastest wave launch rate from compute queues + * due to hw constraints. Nothing in the code needs more than 1 wave/threadgroup. */ +#define THREADGROUP_SIZE 256 /* high numbers limit available VGPRs */ +#define THREADGROUPS_PER_CU 1 /* TGs to launch on 1 CU before going onto the next, max 8 */ +#define MAX_WAVES_PER_SH 0 /* no limit */ +#define INDEX_STORES_USE_SLC 1 /* don't cache indices if L2 is full */ +/* Don't cull Z. We already do (W < 0) culling for primitives behind the viewer. */ +#define CULL_Z 0 +/* 0 = unordered memory counter, 1 = unordered GDS counter, 2 = ordered GDS counter */ +#define VERTEX_COUNTER_GDS_MODE 2 +#define GDS_SIZE_UNORDERED (4 * 1024) /* only for the unordered GDS counter */ + +/* Grouping compute dispatches for small draw calls: How many primitives from multiple + * draw calls to process by compute before signaling the gfx IB. This reduces the number + * of EOP events + REWIND packets, because they decrease performance. */ +#define PRIMS_PER_BATCH (512 * 1024) +/* Draw call splitting at the packet level. 
This allows signaling the gfx IB + * for big draw calls sooner, but doesn't allow context flushes between packets. + * Primitive restart is supported. Only implemented for ordered append. */ +#define SPLIT_PRIMS_PACKET_LEVEL_VALUE PRIMS_PER_BATCH +/* If there is not enough ring buffer space for the current IB, split draw calls into + * this number of primitives, so that we can flush the context and get free ring space. */ +#define SPLIT_PRIMS_DRAW_LEVEL PRIMS_PER_BATCH + +/* Derived values. */ +#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64) +#define SPLIT_PRIMS_PACKET_LEVEL (VERTEX_COUNTER_GDS_MODE == 2 ? \ + SPLIT_PRIMS_PACKET_LEVEL_VALUE : \ + UINT_MAX & ~(THREADGROUP_SIZE - 1)) + +#define REWIND_SIGNAL_BIT 0x80000000 +/* For emulating the rewind packet on CI. */ +#define FORCE_REWIND_EMULATION 0 + +void si_initialize_prim_discard_tunables(struct si_context *sctx) +{ + sctx->prim_discard_vertex_count_threshold = UINT_MAX; /* disable */ + + if (sctx->chip_class == GFX6 || /* SI support is not implemented */ + !sctx->screen->info.has_gds_ordered_append || + sctx->screen->debug_flags & DBG(NO_PD) || + /* If aux_context == NULL, we are initializing aux_context right now. 
*/ + !sctx->screen->aux_context) + return; + + /* TODO: enable this after the GDS kernel memory management is fixed */ + bool enable_on_pro_graphics_by_default = false; + + if (sctx->screen->debug_flags & DBG(ALWAYS_PD) || + sctx->screen->debug_flags & DBG(PD) || + (enable_on_pro_graphics_by_default && + sctx->screen->info.is_pro_graphics && + (sctx->family == CHIP_BONAIRE || + sctx->family == CHIP_HAWAII || + sctx->family == CHIP_TONGA || + sctx->family == CHIP_FIJI || + sctx->family == CHIP_POLARIS10 || + sctx->family == CHIP_POLARIS11 || + sctx->family == CHIP_VEGA10 || + sctx->family == CHIP_VEGA20))) { + sctx->prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */ + + if (sctx->screen->debug_flags & DBG(ALWAYS_PD)) + sctx->prim_discard_vertex_count_threshold = 0; /* always enable */ + + const uint32_t MB = 1024 * 1024; + const uint64_t GB = 1024 * 1024 * 1024; + + /* The total size is double this per context. + * Greater numbers allow bigger gfx IBs. + */ + if (sctx->screen->info.vram_size <= 2 * GB) + sctx->index_ring_size_per_ib = 64 * MB; + else if (sctx->screen->info.vram_size <= 4 * GB) + sctx->index_ring_size_per_ib = 128 * MB; + else + sctx->index_ring_size_per_ib = 256 * MB; + } +} + +/* Opcode can be "add" or "swap". 
*/ +static LLVMValueRef +si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode, + LLVMValueRef m0, LLVMValueRef value, unsigned ordered_count_index, + bool release, bool done) +{ + LLVMValueRef args[] = { + LLVMBuildIntToPtr(ctx->ac.builder, m0, + LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), ""), + value, + LLVMConstInt(ctx->i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */ + ctx->i32_0, /* scope */ + ctx->i1false, /* volatile */ + LLVMConstInt(ctx->i32, ordered_count_index, 0), + LLVMConstInt(ctx->i1, release, 0), + LLVMConstInt(ctx->i1, done, 0), + }; + + char intrinsic[64]; + snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode); + return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->i32, args, ARRAY_SIZE(args), 0); +} + +static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr) +{ + uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32; + ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->i64, ""); + ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->i64, hi, 0), ""); + return LLVMBuildIntToPtr(ctx->ac.builder, ptr, + LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GLOBAL), ""); +} + +struct si_thread0_section { + struct si_shader_context *ctx; + struct lp_build_if_state if_thread0; + LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */ + LLVMValueRef saved_exec; +}; + +/* Enter a section that only executes on thread 0. */ +static void si_enter_thread0_section(struct si_shader_context *ctx, + struct si_thread0_section *section, + LLVMValueRef thread_id) +{ + section->ctx = ctx; + section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->i32, "result0"); + + /* This IF has 4 instructions: + * v_and_b32_e32 v, 63, v ; get the thread ID + * v_cmp_eq_u32_e32 vcc, 0, v ; thread ID == 0 + * s_and_saveexec_b64 s, vcc + * s_cbranch_execz BB0_4 + * + * It could just be s_and_saveexec_b64 s, 1. 
+ */ + lp_build_if(&section->if_thread0, &ctx->gallivm, + LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id, + ctx->i32_0, "")); +} + +/* Exit a section that only executes on thread 0 and broadcast the result + * to all threads. */ +static void si_exit_thread0_section(struct si_thread0_section *section, + LLVMValueRef *result) +{ + struct si_shader_context *ctx = section->ctx; + + LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result); + + lp_build_endif(&section->if_thread0); + + /* Broadcast the result from thread 0 to all threads. */ + *result = ac_build_readlane(&ctx->ac, + LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL); +} + +void si_build_prim_discard_compute_shader(struct si_shader_context *ctx) +{ + struct si_shader_key *key = &ctx->shader->key; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef vs = ctx->main_fn; + + /* Always inline the VS function. */ + ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE); + LLVMSetLinkage(vs, LLVMPrivateLinkage); + + LLVMTypeRef const_desc_type; + if (ctx->shader->selector->info.const_buffers_declared == 1 && + ctx->shader->selector->info.shader_buffers_declared == 0) + const_desc_type = ctx->f32; + else + const_desc_type = ctx->v4i32; + + struct si_function_info fninfo; + si_init_function_info(&fninfo); + + LLVMValueRef index_buffers_and_constants, vertex_counter, vb_desc, const_desc; + LLVMValueRef base_vertex, start_instance, block_id, local_id, ordered_wave_id; + LLVMValueRef restart_index, vp_scale[2], vp_translate[2], smallprim_precision; + LLVMValueRef num_prims_udiv_multiplier, num_prims_udiv_terms, sampler_desc; + LLVMValueRef last_wave_prim_id, vertex_count_addr; + + add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v4i32), + &index_buffers_and_constants); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &vertex_counter); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &last_wave_prim_id); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, 
&vertex_count_addr); + add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v4i32), + &vb_desc); + add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(const_desc_type), + &const_desc); + add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v8i32), + &sampler_desc); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &base_vertex); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &start_instance); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &num_prims_udiv_multiplier); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &num_prims_udiv_terms); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &restart_index); + add_arg_assign(&fninfo, ARG_SGPR, ctx->f32, &smallprim_precision); + + /* Block ID and thread ID inputs. */ + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &block_id); + if (VERTEX_COUNTER_GDS_MODE == 2) + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &ordered_wave_id); + add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &local_id); + + /* Create the compute shader function. */ + unsigned old_type = ctx->type; + ctx->type = PIPE_SHADER_COMPUTE; + si_create_function(ctx, "prim_discard_cs", NULL, 0, &fninfo, THREADGROUP_SIZE); + ctx->type = old_type; + + if (VERTEX_COUNTER_GDS_MODE == 1) { + ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", + GDS_SIZE_UNORDERED); + } + + /* Assemble parameters for VS. 
*/ + LLVMValueRef vs_params[16]; + unsigned num_vs_params = 0; + unsigned param_vertex_id, param_instance_id; + + vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */ + vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */ + vs_params[num_vs_params++] = const_desc; + vs_params[num_vs_params++] = sampler_desc; + vs_params[num_vs_params++] = LLVMConstInt(ctx->i32, + S_VS_STATE_INDEXED(key->opt.cs_indexed), 0); + vs_params[num_vs_params++] = base_vertex; + vs_params[num_vs_params++] = start_instance; + vs_params[num_vs_params++] = ctx->i32_0; /* DrawID */ + vs_params[num_vs_params++] = vb_desc; + + vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */ + vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */ + vs_params[num_vs_params++] = ctx->i32_0; /* unused (PrimID) */ + vs_params[num_vs_params++] = ctx->i32_0; /* unused */ + + assert(num_vs_params <= ARRAY_SIZE(vs_params)); + assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs)))); + + /* Load descriptors. (load 8 dwords at once) */ + LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8]; + + tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants, + ac_array_in_const32_addr_space(ctx->v8i32), ""); + tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->i32_0); + + for (unsigned i = 0; i < 8; i++) + desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i); + + input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4); + output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4); + + /* Compute PrimID and InstanceID. */ + LLVMValueRef global_thread_id = + ac_build_imad(&ctx->ac, block_id, + LLVMConstInt(ctx->i32, THREADGROUP_SIZE, 0), local_id); + LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */ + LLVMValueRef instance_id = ctx->i32_0; + + if (key->opt.cs_instancing) { + /* Unpack num_prims_udiv_terms. 
*/ + LLVMValueRef post_shift = LLVMBuildAnd(builder, num_prims_udiv_terms, + LLVMConstInt(ctx->i32, 0x1f, 0), ""); + LLVMValueRef prims_per_instance = LLVMBuildLShr(builder, num_prims_udiv_terms, + LLVMConstInt(ctx->i32, 5, 0), ""); + /* Divide the total prim_id by the number of prims per instance. */ + instance_id = ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id, + num_prims_udiv_multiplier, + post_shift); + /* Compute the remainder. */ + prim_id = LLVMBuildSub(builder, prim_id, + LLVMBuildMul(builder, instance_id, + prims_per_instance, ""), ""); + } + + /* Generate indices (like a non-indexed draw call). */ + LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->i32)}; + unsigned vertices_per_prim = 3; + + switch (key->opt.cs_prim_type) { + case PIPE_PRIM_TRIANGLES: + for (unsigned i = 0; i < 3; i++) { + index[i] = ac_build_imad(&ctx->ac, prim_id, + LLVMConstInt(ctx->i32, 3, 0), + LLVMConstInt(ctx->i32, i, 0)); + } + break; + case PIPE_PRIM_TRIANGLE_STRIP: + for (unsigned i = 0; i < 3; i++) { + index[i] = LLVMBuildAdd(builder, prim_id, + LLVMConstInt(ctx->i32, i, 0), ""); + } + break; + case PIPE_PRIM_TRIANGLE_FAN: + /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper + * and rasterizer as a normal triangle, so we need to put the provoking + * vertex into the correct index variable and preserve orientation at the same time. + * gl_VertexID is preserved, because it's equal to the index. + */ + if (key->opt.cs_provoking_vertex_first) { + index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), ""); + index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), ""); + index[2] = ctx->i32_0; + } else { + index[0] = ctx->i32_0; + index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), ""); + index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), ""); + } + break; + default: + unreachable("unexpected primitive type"); + } + + /* Fetch indices. 
*/ + if (key->opt.cs_indexed) { + for (unsigned i = 0; i < 3; i++) { + index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, + index[i], ctx->i32_0, 1, + false, true); + index[i] = ac_to_integer(&ctx->ac, index[i]); + } + } + + /* Extract the ordered wave ID. */ + if (VERTEX_COUNTER_GDS_MODE == 2) { + ordered_wave_id = LLVMBuildLShr(builder, ordered_wave_id, + LLVMConstInt(ctx->i32, 6, 0), ""); + ordered_wave_id = LLVMBuildAnd(builder, ordered_wave_id, + LLVMConstInt(ctx->i32, 0xfff, 0), ""); + } + LLVMValueRef thread_id = + LLVMBuildAnd(builder, local_id, LLVMConstInt(ctx->i32, 63, 0), ""); + + /* Every other triangle in a strip has a reversed vertex order, so we + * need to swap vertices of odd primitives to get the correct primitive + * orientation when converting triangle strips to triangles. Primitive + * restart complicates it, because a strip can start anywhere. + */ + LLVMValueRef prim_restart_accepted = ctx->i1true; + + if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) { + /* Without primitive restart, odd primitives have reversed orientation. + * Only primitive restart can flip it with respect to the first vertex + * of the draw call. + */ + LLVMValueRef first_is_odd = ctx->i1false; + + /* Handle primitive restart. */ + if (key->opt.cs_primitive_restart) { + /* Get the GDS primitive restart continue flag and clear + * the flag in vertex_counter. This flag is used when the draw + * call was split and we need to load the primitive orientation + * flag from GDS for the first wave too. 
+ */ + LLVMValueRef gds_prim_restart_continue = + LLVMBuildLShr(builder, vertex_counter, + LLVMConstInt(ctx->i32, 31, 0), ""); + gds_prim_restart_continue = + LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->i1, ""); + vertex_counter = LLVMBuildAnd(builder, vertex_counter, + LLVMConstInt(ctx->i32, 0x7fffffff, 0), ""); + + LLVMValueRef index0_is_reset; + + for (unsigned i = 0; i < 3; i++) { + LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i], + restart_index, ""); + if (i == 0) + index0_is_reset = LLVMBuildNot(builder, not_reset, ""); + prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted, + not_reset, ""); + } + + /* If the previous waves flip the primitive orientation + * of the current triangle strip, it will be stored in GDS. + * + * Sometimes the correct orientation is not needed, in which case + * we don't need to execute this. + */ + if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) { + /* If there are reset indices in this wave, get the thread index + * where the most recent strip starts relative to each thread. + */ + LLVMValueRef preceding_threads_mask = + LLVMBuildSub(builder, + LLVMBuildShl(builder, ctx->ac.i64_1, + LLVMBuildZExt(builder, thread_id, ctx->i64, ""), ""), + ctx->ac.i64_1, ""); + + LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset); + LLVMValueRef preceding_reset_threadmask = + LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, ""); + LLVMValueRef strip_start = + ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL); + strip_start = LLVMBuildAdd(builder, strip_start, ctx->i32_1, ""); + + /* This flips the orientation based on reset indices within this wave only. */ + first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->i1, ""); + + LLVMValueRef last_strip_start, prev_wave_state, ret, tmp; + LLVMValueRef is_first_wave, current_wave_resets_index; + + /* Get the thread index where the last strip starts in this wave. 
+ * + * If the last strip doesn't start in this wave, the thread index + * will be 0. + * + * If the last strip starts in the next wave, the thread index will + * be 64. + */ + last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL); + last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->i32_1, ""); + + struct si_thread0_section section; + si_enter_thread0_section(ctx, &section, thread_id); + + /* This must be done in the thread 0 section, because + * we expect PrimID to be 0 for the whole first wave + * in this expression. + * + * NOTE: This will need to be different if we wanna support + * instancing with primitive restart. + */ + is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->i32_0, ""); + is_first_wave = LLVMBuildAnd(builder, is_first_wave, + LLVMBuildNot(builder, + gds_prim_restart_continue, ""), ""); + current_wave_resets_index = LLVMBuildICmp(builder, LLVMIntNE, + last_strip_start, ctx->i32_0, ""); + + ret = ac_build_alloca_undef(&ctx->ac, ctx->i32, "prev_state"); + + /* Save the last strip start primitive index in GDS and read + * the value that previous waves stored. + * + * if (is_first_wave || current_wave_resets_index) + * // Read the value that previous waves stored and store a new one. + * first_is_odd = ds.ordered.swap(last_strip_start); + * else + * // Just read the value that previous waves stored. + * first_is_odd = ds.ordered.add(0); + */ + struct lp_build_if_state if_overwrite_counter; + lp_build_if(&if_overwrite_counter, &ctx->gallivm, + LLVMBuildOr(builder, is_first_wave, + current_wave_resets_index, "")); + { + /* The GDS address is always 0 with ordered append. */ + tmp = si_build_ds_ordered_op(ctx, "swap", + ordered_wave_id, last_strip_start, + 1, true, false); + LLVMBuildStore(builder, tmp, ret); + } + lp_build_else(&if_overwrite_counter); + { + /* Just read the value from GDS. 
*/ + tmp = si_build_ds_ordered_op(ctx, "add", + ordered_wave_id, ctx->i32_0, + 1, true, false); + LLVMBuildStore(builder, tmp, ret); + } + lp_build_endif(&if_overwrite_counter); + + prev_wave_state = LLVMBuildLoad(builder, ret, ""); + /* Ignore the return value if this is the first wave. */ + prev_wave_state = LLVMBuildSelect(builder, is_first_wave, + ctx->i32_0, prev_wave_state, ""); + si_exit_thread0_section(&section, &prev_wave_state); + prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->i1, ""); + + /* If the strip start appears to be on thread 0 for the current primitive + * (meaning the reset index is not present in this wave and might have + * appeared in previous waves), use the value from GDS to determine + * primitive orientation. + * + * If the strip start is in this wave for the current primitive, use + * the value from the current wave to determine primitive orientation. + */ + LLVMValueRef strip_start_is0 = LLVMBuildICmp(builder, LLVMIntEQ, + strip_start, ctx->i32_0, ""); + first_is_odd = LLVMBuildSelect(builder, strip_start_is0, prev_wave_state, + first_is_odd, ""); + } + } + /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */ + LLVMValueRef prim_is_odd = + LLVMBuildXor(builder, first_is_odd, + LLVMBuildTrunc(builder, thread_id, ctx->i1, ""), ""); + + /* Determine the primitive orientation. + * Only swap the vertices that are not the provoking vertex. We need to keep + * the provoking vertex in place. 
+ */ + if (key->opt.cs_provoking_vertex_first) { + LLVMValueRef index1 = index[1]; + LLVMValueRef index2 = index[2]; + index[1] = LLVMBuildSelect(builder, prim_is_odd, index2, index1, ""); + index[2] = LLVMBuildSelect(builder, prim_is_odd, index1, index2, ""); + } else { + LLVMValueRef index0 = index[0]; + LLVMValueRef index1 = index[1]; + index[0] = LLVMBuildSelect(builder, prim_is_odd, index1, index0, ""); + index[1] = LLVMBuildSelect(builder, prim_is_odd, index0, index1, ""); + } + } + + /* Execute the vertex shader for each vertex to get vertex positions. */ + LLVMValueRef pos[3][4]; + for (unsigned i = 0; i < vertices_per_prim; i++) { + vs_params[param_vertex_id] = index[i]; + vs_params[param_instance_id] = instance_id; + + LLVMValueRef ret = LLVMBuildCall(builder, vs, vs_params, num_vs_params, ""); + for (unsigned chan = 0; chan < 4; chan++) + pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, ""); + } + + /* Divide XYZ by W. */ + for (unsigned i = 0; i < vertices_per_prim; i++) { + for (unsigned chan = 0; chan < 3; chan++) + pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]); + } + + /* Load the viewport state. */ + LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants, + LLVMConstInt(ctx->i32, 2, 0)); + vp = LLVMBuildBitCast(builder, vp, ctx->v4f32, ""); + vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0); + vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1); + vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2); + vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3); + + /* Do culling. 
*/ + struct ac_cull_options options = {}; + options.cull_front = key->opt.cs_cull_front; + options.cull_back = key->opt.cs_cull_back; + options.cull_view_xy = true; + options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z; + options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z; + options.cull_small_prims = true; + options.cull_zero_area = true; + options.cull_w = true; + options.use_halfz_clip_space = key->opt.cs_halfz_clip_space; + + LLVMValueRef accepted = + ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, + vp_scale, vp_translate, smallprim_precision, + &options); + + LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted); + + /* Count the number of active threads by doing bitcount(accepted). */ + LLVMValueRef num_prims_accepted = + ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i64", ctx->i64, + &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE); + num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->i32, ""); + + LLVMValueRef start; + + /* Execute atomic_add on the vertex count. 
*/ + struct si_thread0_section section; + si_enter_thread0_section(ctx, &section, thread_id); + { + if (VERTEX_COUNTER_GDS_MODE == 0) { + LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted, + LLVMConstInt(ctx->i32, vertices_per_prim, 0), ""); + vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter); + start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, + vertex_counter, num_indices, + LLVMAtomicOrderingMonotonic, false); + } else if (VERTEX_COUNTER_GDS_MODE == 1) { + LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted, + LLVMConstInt(ctx->i32, vertices_per_prim, 0), ""); + vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter, + LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), ""); + start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, + vertex_counter, num_indices, + LLVMAtomicOrderingMonotonic, false); + } else if (VERTEX_COUNTER_GDS_MODE == 2) { + LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->i32, ""); + + /* If the draw call was split into multiple subdraws, each using + * a separate draw packet, we need to start counting from 0 for + * the first compute wave of the subdraw. + * + * vertex_counter contains the primitive ID of the first thread + * in the first wave. + * + * This is only correct with VERTEX_COUNTER_GDS_MODE == 2: + */ + LLVMValueRef is_first_wave = + LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id, + vertex_counter, ""); + + /* Store the primitive count for ordered append, not vertex count. + * The idea is to avoid GDS initialization via CP DMA. The shader + * effectively stores the first count using "swap".
+ * + * if (first_wave) { + * ds.ordered.swap(num_prims_accepted); // store the first primitive count + * previous = 0; + * } else { + * previous = ds.ordered.add(num_prims_accepted) // add the primitive count + * } + */ + struct lp_build_if_state if_first_wave; + lp_build_if(&if_first_wave, &ctx->gallivm, is_first_wave); + { + /* The GDS address is always 0 with ordered append. */ + si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, + num_prims_accepted, 0, true, true); + LLVMBuildStore(builder, ctx->i32_0, tmp_store); + } + lp_build_else(&if_first_wave); + { + LLVMBuildStore(builder, + si_build_ds_ordered_op(ctx, "add", ordered_wave_id, + num_prims_accepted, 0, + true, true), + tmp_store); + } + lp_build_endif(&if_first_wave); + + start = LLVMBuildLoad(builder, tmp_store, ""); + } + } + si_exit_thread0_section(&section, &start); + + /* Write the final vertex count to memory. An EOS/EOP event could do this, + * but those events are super slow and should be avoided if performance + * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE + * event like this. + */ + if (VERTEX_COUNTER_GDS_MODE == 2) { + struct lp_build_if_state if_last_wave; + lp_build_if(&if_last_wave, &ctx->gallivm, + LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id, + last_wave_prim_id, "")); + LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, ""); + count = LLVMBuildMul(builder, count, + LLVMConstInt(ctx->i32, vertices_per_prim, 0), ""); + + /* VI needs to disable caching, so that the CP can see the stored value. + * MTYPE=3 bypasses TC L2.
+ */ + if (ctx->screen->info.chip_class <= GFX8) { + LLVMValueRef desc[] = { + vertex_count_addr, + LLVMConstInt(ctx->i32, + S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0), + LLVMConstInt(ctx->i32, 4, 0), + LLVMConstInt(ctx->i32, S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_MTYPE(3 /* uncached */), 0), + }; + LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4); + ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->i32_0, + ctx->i32_0, 0, true, true, true, false); + } else { + LLVMBuildStore(builder, count, + si_expand_32bit_pointer(ctx, vertex_count_addr)); + } + lp_build_endif(&if_last_wave); + } else { + /* For unordered modes that increment a vertex count instead of + * primitive count, convert it into the primitive index. + */ + start = LLVMBuildUDiv(builder, start, + LLVMConstInt(ctx->i32, vertices_per_prim, 0), ""); + } + + /* Now we need to store the indices of accepted primitives into + * the output index buffer. + */ + struct lp_build_if_state if_accepted; + lp_build_if(&if_accepted, &ctx->gallivm, accepted); + { + /* Get the number of bits set before the index of this thread. */ + LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask); + + /* We have lowered instancing. Pack the instance ID into vertex ID. */ + if (key->opt.cs_instancing) { + instance_id = LLVMBuildShl(builder, instance_id, + LLVMConstInt(ctx->i32, 16, 0), ""); + + for (unsigned i = 0; i < vertices_per_prim; i++) + index[i] = LLVMBuildOr(builder, index[i], instance_id, ""); + } + + if (VERTEX_COUNTER_GDS_MODE == 2) { + /* vertex_counter contains the first primitive ID + * for this dispatch. If the draw call was split into + * multiple subdraws, the first primitive ID is > 0 + * for subsequent subdraws. Each subdraw uses a different + * portion of the output index buffer. Offset the store + * vindex by the first primitive ID to get the correct + * store address for the subdraw. 
+ */ + start = LLVMBuildAdd(builder, start, vertex_counter, ""); + } + + /* Write indices for accepted primitives. */ + LLVMValueRef buf_args[] = { + ac_to_float(&ctx->ac, ac_build_expand_to_vec4(&ctx->ac, + ac_build_gather_values(&ctx->ac, index, 3), 3)), + output_indexbuf, + LLVMBuildAdd(builder, start, prim_index, ""), + ctx->i32_0, /* voffset */ + ctx->i1true, /* glc */ + LLVMConstInt(ctx->i1, INDEX_STORES_USE_SLC, 0), + }; + ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.buffer.store.format.v4f32", + ctx->voidt, buf_args, 6, + ac_get_store_intr_attribs(true)); + } + lp_build_endif(&if_accepted); + + LLVMBuildRetVoid(builder); +} + +/* Return false if the shader isn't ready. */ +static bool si_shader_select_prim_discard_cs(struct si_context *sctx, + const struct pipe_draw_info *info) +{ + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + struct si_shader_key key; + + /* Primitive restart needs ordered counters. */ + assert(!info->primitive_restart || VERTEX_COUNTER_GDS_MODE == 2); + assert(!info->primitive_restart || info->instance_count == 1); + + memset(&key, 0, sizeof(key)); + si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, &key, &key.part.vs.prolog); + assert(!key.part.vs.prolog.instance_divisor_is_fetched); + + key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0; + key.opt.vs_as_prim_discard_cs = 1; + key.opt.cs_prim_type = info->mode; + key.opt.cs_indexed = info->index_size != 0; + key.opt.cs_instancing = info->instance_count > 1; + key.opt.cs_primitive_restart = info->primitive_restart; + key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first; + + /* Primitive restart with triangle strips needs to preserve primitive + * orientation for cases where front and back primitive orientation matters. 
+ */ + if (info->primitive_restart) { + struct si_shader_selector *ps = sctx->ps_shader.cso; + + key.opt.cs_need_correct_orientation = + rs->cull_front != rs->cull_back || + ps->info.uses_frontface || + (rs->two_side && ps->info.colors_read); + } + + if (rs->rasterizer_discard) { + /* Just for performance testing and analysis of trivial bottlenecks. + * This should result in a very short compute shader. */ + key.opt.cs_cull_front = 1; + key.opt.cs_cull_back = 1; + } else { + key.opt.cs_cull_front = + sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front; + key.opt.cs_cull_back = + sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back; + } + + if (!rs->depth_clamp_any && CULL_Z) { + key.opt.cs_cull_z = 1; + key.opt.cs_halfz_clip_space = rs->clip_halfz; + } + + sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso; + sctx->cs_prim_discard_state.current = NULL; + + struct si_compiler_ctx_state compiler_state; + compiler_state.compiler = &sctx->compiler; + compiler_state.debug = sctx->debug; + compiler_state.is_debug_context = sctx->is_debug; + + return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state, + &compiler_state, &key, -1, true) == 0 && + /* Disallow compute shaders using the scratch buffer. */ + sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0; +} + +static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx) +{ + if (sctx->index_ring) + return true; + + if (!sctx->prim_discard_compute_cs) { + struct radeon_winsys *ws = sctx->ws; + unsigned gds_size = VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED : + VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0; + unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 
2 : 0; + + if (gds_size) { + sctx->gds = ws->buffer_create(ws, gds_size, 4, + RADEON_DOMAIN_GDS, 0); + if (!sctx->gds) + return false; + + ws->cs_add_buffer(sctx->gfx_cs, sctx->gds, + RADEON_USAGE_READWRITE, 0, 0); + } + if (num_oa_counters) { + assert(gds_size); + sctx->gds_oa = ws->buffer_create(ws, num_oa_counters, + 1, RADEON_DOMAIN_OA, 0); + if (!sctx->gds_oa) + return false; + + ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa, + RADEON_USAGE_READWRITE, 0, 0); + } + + sctx->prim_discard_compute_cs = + ws->cs_add_parallel_compute_ib(sctx->gfx_cs, + num_oa_counters > 0); + if (!sctx->prim_discard_compute_cs) + return false; + } + + if (!sctx->index_ring) { + sctx->index_ring = + si_aligned_buffer_create(sctx->b.screen, + SI_RESOURCE_FLAG_UNMAPPABLE, + PIPE_USAGE_DEFAULT, + sctx->index_ring_size_per_ib * 2, + 2 * 1024 * 1024); + if (!sctx->index_ring) + return false; + } + return true; +} + +static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_size) +{ + return sctx->index_ring_offset + + align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <= + sctx->index_ring_size_per_ib; +} + +enum si_prim_discard_outcome +si_prepare_prim_discard_or_split_draw(struct si_context *sctx, + const struct pipe_draw_info *info) +{ + /* If the compute shader compilation isn't finished, this returns false. 
*/ + if (!si_shader_select_prim_discard_cs(sctx, info)) + return SI_PRIM_DISCARD_DISABLED; + + if (!si_initialize_prim_discard_cmdbuf(sctx)) + return SI_PRIM_DISCARD_DISABLED; + + struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs; + unsigned prim = info->mode; + unsigned count = info->count; + unsigned instance_count = info->instance_count; + unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count); + unsigned num_prims = num_prims_per_instance * instance_count; + unsigned out_indexbuf_size = num_prims * 12; + bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size); + const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL; + + /* Split draws at the draw call level if the ring is full. This makes + * better use of the ring space. + */ + if (ring_full && + num_prims > split_prims_draw_level && + instance_count == 1 && /* TODO: support splitting instanced draws */ + (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | + (1 << PIPE_PRIM_TRIANGLE_STRIP))) { + /* Split draws. */ + struct pipe_draw_info split_draw = *info; + unsigned base_start = split_draw.start; + + if (prim == PIPE_PRIM_TRIANGLES) { + unsigned vert_count_per_subdraw = split_prims_draw_level * 3; + assert(vert_count_per_subdraw < count); + + for (unsigned start = 0; start < count; start += vert_count_per_subdraw) { + split_draw.start = base_start + start; + split_draw.count = MIN2(count - start, vert_count_per_subdraw); + + sctx->b.draw_vbo(&sctx->b, &split_draw); + } + } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) { + /* No primitive pair can be split, because strips reverse orientation + * for odd primitives. 
*/ + STATIC_ASSERT(split_prims_draw_level % 2 == 0); + + unsigned vert_count_per_subdraw = split_prims_draw_level; + + for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) { + split_draw.start = base_start + start; + split_draw.count = MIN2(count - start, vert_count_per_subdraw + 2); + + sctx->b.draw_vbo(&sctx->b, &split_draw); + + if (start == 0 && + split_draw.primitive_restart && + sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation) + sctx->preserve_prim_restart_gds_at_flush = true; + } + sctx->preserve_prim_restart_gds_at_flush = false; + } else { + assert(0); + } + + return SI_PRIM_DISCARD_DRAW_SPLIT; + } + + /* Just quit if the draw call doesn't fit into the ring and can't be split. */ + if (out_indexbuf_size > sctx->index_ring_size_per_ib) { + if (SI_PRIM_DISCARD_DEBUG) + puts("PD failed: draw call too big, can't be split"); + return SI_PRIM_DISCARD_DISABLED; + } + + unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL); + unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ + + 24 * (num_subdraws - 1) + /* subdraws */ + 20; /* leave some space at the end */ + unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx); + + if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) + need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */ + else + need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */ + + if (ring_full || + (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) || + !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) { + /* If the current IB is empty but the size is too small, add a NOP + * packet to force a flush and get a bigger IB. 
+ */ + if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) && + gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) { + radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(gfx_cs, 0); + } + + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + } + + /* The compute IB is always chained, but we need to call cs_check_space to add more space. */ + struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; + bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false); + assert(compute_has_space); + assert(si_check_ring_space(sctx, out_indexbuf_size)); + return SI_PRIM_DISCARD_ENABLED; +} + +void si_compute_signal_gfx(struct si_context *sctx) +{ + struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; + unsigned writeback_L2_flags = 0; + + /* The writeback L2 flags vary with each chip generation. */ + /* CI needs to flush vertex indices to memory. */ + if (sctx->chip_class <= GFX7) + writeback_L2_flags = EVENT_TC_WB_ACTION_ENA; + else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0) + writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA; + + if (!sctx->compute_num_prims_in_batch) + return; + + assert(sctx->compute_rewind_va); + + /* After the queued dispatches are done and vertex counts are written to + * the gfx IB, signal the gfx IB to continue. CP doesn't wait for + * the dispatches to finish, it only adds the CS_DONE event into the event + * queue. + */ + si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags, + sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, + writeback_L2_flags ? 
EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM : + EOP_INT_SEL_NONE, + EOP_DATA_SEL_VALUE_32BIT, + NULL, + sctx->compute_rewind_va | + ((uint64_t)sctx->screen->info.address32_hi << 32), + REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */ + SI_NOT_QUERY); + + sctx->compute_rewind_va = 0; + sctx->compute_num_prims_in_batch = 0; +} + +/* Dispatch a primitive discard compute shader. */ +void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, + const struct pipe_draw_info *info, + unsigned index_size, + unsigned base_vertex, + uint64_t input_indexbuf_va, + unsigned input_indexbuf_num_elements) +{ + struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs; + struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; + unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, info->count); + if (!num_prims_per_instance) + return; + + unsigned num_prims = num_prims_per_instance * info->instance_count; + unsigned vertices_per_prim, output_indexbuf_format; + + switch (info->mode) { + case PIPE_PRIM_TRIANGLES: + case PIPE_PRIM_TRIANGLE_STRIP: + case PIPE_PRIM_TRIANGLE_FAN: + vertices_per_prim = 3; + output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32; + break; + default: + unreachable("unsupported primitive type"); + return; + } + + unsigned out_indexbuf_offset; + uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4; + bool first_dispatch = !sctx->prim_discard_compute_ib_initialized; + + /* Initialize the compute IB if it's empty. */ + if (!sctx->prim_discard_compute_ib_initialized) { + /* 1) State initialization. */ + sctx->compute_gds_offset = 0; + sctx->compute_ib_last_shader = NULL; + + if (sctx->last_ib_barrier_fence) { + assert(!sctx->last_ib_barrier_buf); + sctx->ws->cs_add_fence_dependency(gfx_cs, + sctx->last_ib_barrier_fence, + RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY); + } + + /* 2) IB initialization. */ + /* Restore the GDS prim restart counter if needed. 
*/ + if (sctx->preserve_prim_restart_gds_at_flush) { + si_cp_copy_data(sctx, cs, + COPY_DATA_GDS, NULL, 4, + COPY_DATA_SRC_MEM, sctx->wait_mem_scratch, 4); + } + + si_emit_initial_compute_regs(sctx, cs); + + radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE, + S_00B860_WAVES(sctx->scratch_waves) | + S_00B860_WAVESIZE(0)); /* no scratch */ + + /* Only 1D grids are launched. */ + radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2); + radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) | + S_00B820_NUM_THREAD_PARTIAL(1)); + radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) | + S_00B824_NUM_THREAD_PARTIAL(1)); + + radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2); + radeon_emit(cs, 0); + radeon_emit(cs, 0); + + /* Disable ordered alloc for OA resources. */ + for (unsigned i = 0; i < 2; i++) { + radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3); + radeon_emit(cs, S_031074_INDEX(i)); + radeon_emit(cs, 0); + radeon_emit(cs, S_03107C_ENABLE(0)); + } + + if (sctx->last_ib_barrier_buf) { + assert(!sctx->last_ib_barrier_fence); + radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, + RADEON_USAGE_READ, RADEON_PRIO_FENCE); + si_cp_wait_mem(sctx, cs, + sctx->last_ib_barrier_buf->gpu_address + + sctx->last_ib_barrier_buf_offset, 1, 1, + WAIT_REG_MEM_EQUAL); + } + + sctx->prim_discard_compute_ib_initialized = true; + } + + /* Allocate the output index buffer. */ + output_indexbuf_size = align(output_indexbuf_size, + sctx->screen->info.tcc_cache_line_size); + assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib); + out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset; + sctx->index_ring_offset += output_indexbuf_size; + + radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE, + RADEON_PRIO_SHADER_RW_BUFFER); + uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset; + + /* Prepare index buffer descriptors. 
*/ + struct si_resource *indexbuf_desc = NULL; + unsigned indexbuf_desc_offset; + unsigned desc_size = 12 * 4; + uint32_t *desc; + + u_upload_alloc(sctx->b.const_uploader, 0, desc_size, + si_optimal_tcc_alignment(sctx, desc_size), + &indexbuf_desc_offset, (struct pipe_resource**)&indexbuf_desc, + (void**)&desc); + radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ, + RADEON_PRIO_DESCRIPTORS); + + /* Input index buffer. */ + desc[0] = input_indexbuf_va; + desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | + S_008F04_STRIDE(index_size); + desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? index_size : 1); + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | + S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 : + index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16 : + V_008F0C_BUF_DATA_FORMAT_32); + + /* Output index buffer. */ + desc[4] = out_indexbuf_va; + desc[5] = S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | + S_008F04_STRIDE(vertices_per_prim * 4); + desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1); + desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | + S_008F0C_DATA_FORMAT(output_indexbuf_format); + + /* Viewport state. + * This is needed by the small primitive culling, because it's done + * in screen space. + */ + float scale[2], translate[2]; + + scale[0] = sctx->viewports.states[0].scale[0]; + scale[1] = sctx->viewports.states[0].scale[1]; + translate[0] = sctx->viewports.states[0].translate[0]; + translate[1] = sctx->viewports.states[0].translate[1]; + + /* The viewport shouldn't flip the X axis for the small prim culling to work. 
*/ + assert(-scale[0] + translate[0] <= scale[0] + translate[0]); + + /* If the Y axis is inverted (OpenGL default framebuffer), reverse it. + * This is because the viewport transformation inverts the clip space + * bounding box, so min becomes max, which breaks small primitive + * culling. + */ + if (sctx->viewports.y_inverted) { + scale[1] = -scale[1]; + translate[1] = -translate[1]; + } + + /* Scale the framebuffer up, so that samples become pixels and small + * primitive culling is the same for all sample counts. + * This only works with the standard DX sample positions, because + * the samples are evenly spaced on both X and Y axes. + */ + unsigned num_samples = sctx->framebuffer.nr_samples; + assert(num_samples >= 1); + + for (unsigned i = 0; i < 2; i++) { + scale[i] *= num_samples; + translate[i] *= num_samples; + } + + desc[8] = fui(scale[0]); + desc[9] = fui(scale[1]); + desc[10] = fui(translate[0]); + desc[11] = fui(translate[1]); + + /* Better subpixel precision increases the efficiency of small + * primitive culling. */ + unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode; + float small_prim_cull_precision; + + if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH) + small_prim_cull_precision = num_samples / 4096.0; + else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH) + small_prim_cull_precision = num_samples / 1024.0; + else + small_prim_cull_precision = num_samples / 256.0; + + /* Set user data SGPRs. */ + /* This can't be greater than 14 if we want the fastest launch rate. 
*/ + unsigned user_sgprs = 13; + + uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset; + unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX); + unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX); + uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address; + uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address; + uint64_t vb_desc_va = sctx->vb_descriptors_buffer ? + sctx->vb_descriptors_buffer->gpu_address + + sctx->vb_descriptors_offset : 0; + unsigned gds_offset, gds_size; + struct si_fast_udiv_info32 num_prims_udiv = {}; + + if (info->instance_count > 1) + num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31); + + /* Limitations on how these two are packed in the user SGPR. */ + assert(num_prims_udiv.post_shift < 32); + assert(num_prims_per_instance < 1 << 27); + + si_resource_reference(&indexbuf_desc, NULL); + + if (VERTEX_COUNTER_GDS_MODE == 1) { + gds_offset = sctx->compute_gds_offset; + gds_size = info->primitive_restart ? 8 : 4; + sctx->compute_gds_offset += gds_size; + + /* Reset the counters in GDS for the first dispatch using WRITE_DATA. + * The remainder of the GDS will be cleared after the dispatch packet + * in parallel with compute shaders. + */ + if (first_dispatch) { + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size/4, 0)); + radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1)); + radeon_emit(cs, gds_offset); + radeon_emit(cs, 0); + radeon_emit(cs, 0); /* value to write */ + if (gds_size == 8) + radeon_emit(cs, 0); + } + } + + /* Set shader registers. 
*/ + struct si_shader *shader = sctx->cs_prim_discard_state.current; + + if (shader != sctx->compute_ib_last_shader) { + radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ, + RADEON_PRIO_SHADER_BINARY); + uint64_t shader_va = shader->bo->gpu_address; + + assert(shader->config.scratch_bytes_per_wave == 0); + assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4); + + radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); + radeon_emit(cs, shader_va >> 8); + radeon_emit(cs, S_00B834_DATA(shader_va >> 40)); + + radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); + radeon_emit(cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) | + S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8) | + S_00B848_FLOAT_MODE(shader->config.float_mode) | + S_00B848_DX10_CLAMP(1)); + radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) | + S_00B84C_USER_SGPR(user_sgprs) | + S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) | + S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) | + S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) | + S_00B84C_LDS_SIZE(shader->config.lds_size)); + + radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, + si_get_compute_resource_limits(sctx->screen, WAVES_PER_TG, + MAX_WAVES_PER_SH, THREADGROUPS_PER_CU)); + sctx->compute_ib_last_shader = shader; + } + + STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0); + + /* Big draw calls are split into smaller dispatches and draw packets. */ + for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) { + unsigned num_subdraw_prims; + + if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims) + num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL; + else + num_subdraw_prims = num_prims - start_prim; + + /* Small dispatches are executed back to back until a specific primitive + * count is reached. Then, a CS_DONE is inserted to signal the gfx IB + * to start drawing the batch. 
This batching adds latency to the gfx IB, + * but CS_DONE and REWIND are too slow. + */ + if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH) + si_compute_signal_gfx(sctx); + + if (sctx->compute_num_prims_in_batch == 0) { + assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi); + sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4; + + if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) { + radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(gfx_cs, 0); + + si_cp_wait_mem(sctx, gfx_cs, + sctx->compute_rewind_va | + (uint64_t)sctx->screen->info.address32_hi << 32, + REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT, + WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP); + + /* Use INDIRECT_BUFFER to chain to a different buffer + * to discard the CP prefetch cache. + */ + sctx->ws->cs_check_space(gfx_cs, 0, true); + } else { + radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0)); + radeon_emit(gfx_cs, 0); + } + } + + sctx->compute_num_prims_in_batch += num_subdraw_prims; + + uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4; + uint64_t index_va = out_indexbuf_va + start_prim * 12; + + /* Emit the draw packet into the gfx IB. */ + radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0)); + radeon_emit(gfx_cs, num_prims * vertices_per_prim); + radeon_emit(gfx_cs, index_va); + radeon_emit(gfx_cs, index_va >> 32); + radeon_emit(gfx_cs, 0); + radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA); + + /* Continue with the compute IB. */ + if (start_prim == 0) { + uint32_t gds_prim_restart_continue_bit = 0; + + if (sctx->preserve_prim_restart_gds_at_flush) { + assert(info->primitive_restart && + info->mode == PIPE_PRIM_TRIANGLE_STRIP); + assert(start_prim < 1 << 31); + gds_prim_restart_continue_bit = 1 << 31; + } + + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs); + radeon_emit(cs, index_buffers_va); + radeon_emit(cs, + VERTEX_COUNTER_GDS_MODE == 0 ? count_va : + VERTEX_COUNTER_GDS_MODE == 1 ? 
gds_offset : + start_prim | + gds_prim_restart_continue_bit); + radeon_emit(cs, start_prim + num_subdraw_prims - 1); + radeon_emit(cs, count_va); + radeon_emit(cs, vb_desc_va); + radeon_emit(cs, vs_const_desc_va); + radeon_emit(cs, vs_sampler_desc_va); + radeon_emit(cs, base_vertex); + radeon_emit(cs, info->start_instance); + radeon_emit(cs, num_prims_udiv.multiplier); + radeon_emit(cs, num_prims_udiv.post_shift | + (num_prims_per_instance << 5)); + radeon_emit(cs, info->restart_index); + /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */ + radeon_emit(cs, fui(small_prim_cull_precision)); + } else { + assert(VERTEX_COUNTER_GDS_MODE == 2); + /* Only update the SGPRs that changed. */ + radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3); + radeon_emit(cs, start_prim); + radeon_emit(cs, start_prim + num_subdraw_prims - 1); + radeon_emit(cs, count_va); + } + + /* Set grid dimensions. */ + unsigned start_block = start_prim / THREADGROUP_SIZE; + unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE; + unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE; + + radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block); + radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X, + S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) | + S_00B81C_NUM_THREAD_PARTIAL(partial_block_size)); + + radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | + PKT3_SHADER_TYPE_S(1)); + radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size); + radeon_emit(cs, 1); + radeon_emit(cs, 1); + radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | + S_00B800_PARTIAL_TG_EN(!!partial_block_size) | + S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) | + S_00B800_ORDER_MODE(0 /* launch in order */)); + + /* This is only for unordered append. Ordered append writes this from + * the shader. + * + * Note that EOP and EOS events are super slow, so emulating the event + * in a shader is an important optimization. 
+ */ + if (VERTEX_COUNTER_GDS_MODE == 1) { + si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0, + sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, + EOP_INT_SEL_NONE, + EOP_DATA_SEL_GDS, + NULL, + count_va | ((uint64_t)sctx->screen->info.address32_hi << 32), + EOP_DATA_GDS(gds_offset / 4, 1), + SI_NOT_QUERY); + + /* Now that compute shaders are running, clear the remainder of GDS. */ + if (first_dispatch) { + unsigned offset = gds_offset + gds_size; + si_cp_dma_clear_buffer(sctx, cs, NULL, offset, + GDS_SIZE_UNORDERED - offset, + 0, + SI_CPDMA_SKIP_CHECK_CS_SPACE | + SI_CPDMA_SKIP_GFX_SYNC | + SI_CPDMA_SKIP_SYNC_BEFORE, + SI_COHERENCY_NONE, L2_BYPASS); + } + } + first_dispatch = false; + + assert(cs->current.cdw <= cs->current.max_dw); + assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw); + } +} diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 37ab31a410b..e83016fc531 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -249,8 +249,10 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, sdst->TC_L2_dirty = true; /* If it's not a framebuffer fast clear... */ - if (coher == SI_COHERENCY_SHADER) + if (coher == SI_COHERENCY_SHADER) { sctx->num_cp_dma_calls++; + si_prim_discard_signal_next_compute_ib_start(sctx); + } } /** @@ -405,8 +407,10 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, si_resource(dst)->TC_L2_dirty = true; /* If it's not a prefetch or GDS copy... 
*/ - if (dst && src && (dst != src || dst_offset != src_offset)) + if (dst && src && (dst != src || dst_offset != src_offset)) { sctx->num_cp_dma_calls++; + si_prim_discard_signal_next_compute_ib_start(sctx); + } } void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c index bd85fc49387..fc2d731fa22 100644 --- a/src/gallium/drivers/radeonsi/si_debug.c +++ b/src/gallium/drivers/radeonsi/si_debug.c @@ -337,6 +337,7 @@ struct si_log_chunk_cs { struct si_saved_cs *cs; bool dump_bo_list; unsigned gfx_begin, gfx_end; + unsigned compute_begin, compute_end; }; static void si_log_chunk_type_cs_destroy(void *data) @@ -394,6 +395,7 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) struct si_context *ctx = chunk->ctx; struct si_saved_cs *scs = chunk->cs; int last_trace_id = -1; + int last_compute_trace_id = -1; /* We are expecting that the ddebug pipe has already * waited for the context, so this buffer should be idle. @@ -403,8 +405,10 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) NULL, PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_READ); - if (map) + if (map) { last_trace_id = map[0]; + last_compute_trace_id = map[1]; + } if (chunk->gfx_end != chunk->gfx_begin) { if (chunk->gfx_begin == 0) { @@ -432,6 +436,21 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) } } + if (chunk->compute_end != chunk->compute_begin) { + assert(ctx->prim_discard_compute_cs); + + if (scs->flushed) { + ac_parse_ib(f, scs->compute.ib + chunk->compute_begin, + chunk->compute_end - chunk->compute_begin, + &last_compute_trace_id, map ? 1 : 0, "Compute IB", ctx->chip_class, + NULL, NULL); + } else { + si_parse_current_ib(f, ctx->prim_discard_compute_cs, chunk->compute_begin, + chunk->compute_end, &last_compute_trace_id, + map ? 1 : 0, "Compute IB", ctx->chip_class); + } + } + if (chunk->dump_bo_list) { fprintf(f, "Flushing. 
Time: "); util_dump_ns(f, scs->time_flush); @@ -452,9 +471,14 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, struct si_saved_cs *scs = ctx->current_saved_cs; unsigned gfx_cur = ctx->gfx_cs->prev_dw + ctx->gfx_cs->current.cdw; + unsigned compute_cur = 0; + + if (ctx->prim_discard_compute_cs) + compute_cur = ctx->prim_discard_compute_cs->prev_dw + ctx->prim_discard_compute_cs->current.cdw; if (!dump_bo_list && - gfx_cur == scs->gfx_last_dw) + gfx_cur == scs->gfx_last_dw && + compute_cur == scs->compute_last_dw) return; struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk)); @@ -467,6 +491,10 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, chunk->gfx_end = gfx_cur; scs->gfx_last_dw = gfx_cur; + chunk->compute_begin = scs->compute_last_dw; + chunk->compute_end = compute_cur; + scs->compute_last_dw = compute_cur; + u_log_chunk(log, &si_log_chunk_type_cs, chunk); } diff --git a/src/gallium/drivers/radeonsi/si_fence.c b/src/gallium/drivers/radeonsi/si_fence.c index fdd10d8bdef..1d67fd87b90 100644 --- a/src/gallium/drivers/radeonsi/si_fence.c +++ b/src/gallium/drivers/radeonsi/si_fence.c @@ -80,7 +80,7 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, EOP_INT_SEL(int_sel) | EOP_DATA_SEL(data_sel); - if (ctx->chip_class >= GFX9) { + if (ctx->chip_class >= GFX9 || cs == ctx->prim_discard_compute_cs) { /* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion * counters) must immediately precede every timestamp event to * prevent a GPU hang on GFX9. @@ -89,6 +89,7 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, * always do ZPASS_DONE before the timestamp. 
*/ if (ctx->chip_class == GFX9 && + cs != ctx->prim_discard_compute_cs && query_type != PIPE_QUERY_OCCLUSION_COUNTER && query_type != PIPE_QUERY_OCCLUSION_PREDICATE && query_type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) { @@ -105,14 +106,15 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); } - radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 6, 0)); + radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, ctx->chip_class >= GFX9 ? 6 : 5, 0)); radeon_emit(cs, op); radeon_emit(cs, sel); radeon_emit(cs, va); /* address lo */ radeon_emit(cs, va >> 32); /* address hi */ radeon_emit(cs, new_fence); /* immediate data lo */ radeon_emit(cs, 0); /* immediate data hi */ - radeon_emit(cs, 0); /* unused */ + if (ctx->chip_class >= GFX9) + radeon_emit(cs, 0); /* unused */ } else { if (ctx->chip_class == GFX7 || ctx->chip_class == GFX8) { diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index 121ab75c08b..de0909904c8 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -24,6 +24,8 @@ */ #include "si_pipe.h" +#include "si_build_pm4.h" +#include "sid.h" #include "util/os_time.h" #include "util/u_upload_mgr.h" @@ -134,6 +136,24 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, if (radeon_emitted(ctx->dma_cs, 0)) si_flush_dma_cs(ctx, flags, NULL); + if (radeon_emitted(ctx->prim_discard_compute_cs, 0)) { + struct radeon_cmdbuf *compute_cs = ctx->prim_discard_compute_cs; + si_compute_signal_gfx(ctx); + + /* Make sure compute shaders are idle before leaving the IB, so that + * the next IB doesn't overwrite GDS that might be in use. */ + radeon_emit(compute_cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(compute_cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | + EVENT_INDEX(4)); + + /* Save the GDS prim restart counter if needed. 
*/ + if (ctx->preserve_prim_restart_gds_at_flush) { + si_cp_copy_data(ctx, compute_cs, + COPY_DATA_DST_MEM, ctx->wait_mem_scratch, 4, + COPY_DATA_GDS, NULL, 4); + } + } + if (ctx->has_graphics) { if (!LIST_IS_EMPTY(&ctx->active_queries)) si_suspend_queries(ctx); @@ -168,6 +188,32 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, si_log_hw_flush(ctx); } + if (si_compute_prim_discard_enabled(ctx)) { + /* The compute IB can start after the previous gfx IB starts. */ + if (radeon_emitted(ctx->prim_discard_compute_cs, 0) && + ctx->last_gfx_fence) { + ctx->ws->cs_add_fence_dependency(ctx->gfx_cs, + ctx->last_gfx_fence, + RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY | + RADEON_DEPENDENCY_START_FENCE); + } + + /* Remember the last execution barrier. It's in the IB. + * It will signal the start of the next compute IB. + */ + if (flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW && + ctx->last_pkt3_write_data) { + *ctx->last_pkt3_write_data = PKT3(PKT3_WRITE_DATA, 3, 0); + ctx->last_pkt3_write_data = NULL; + + si_resource_reference(&ctx->last_ib_barrier_buf, ctx->barrier_buf); + ctx->last_ib_barrier_buf_offset = ctx->barrier_buf_offset; + si_resource_reference(&ctx->barrier_buf, NULL); + + ws->fence_reference(&ctx->last_ib_barrier_fence, NULL); + } + } + /* Flush the CS. */ ws->cs_flush(cs, flags, &ctx->last_gfx_fence); if (fence) @@ -175,6 +221,17 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, ctx->num_gfx_cs_flushes++; + if (si_compute_prim_discard_enabled(ctx)) { + /* Remember the last execution barrier, which is the last fence + * in this case. + */ + if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) { + ctx->last_pkt3_write_data = NULL; + si_resource_reference(&ctx->last_ib_barrier_buf, NULL); + ws->fence_reference(&ctx->last_ib_barrier_fence, ctx->last_gfx_fence); + } + } + /* Check VM faults if needed. 
*/ if (ctx->screen->debug_flags & DBG(CHECK_VM)) { /* Use conservative timeout 800ms, after which we won't wait any @@ -226,6 +283,16 @@ void si_begin_new_gfx_cs(struct si_context *ctx) if (ctx->is_debug) si_begin_gfx_cs_debug(ctx); + if (ctx->gds) { + ctx->ws->cs_add_buffer(ctx->gfx_cs, ctx->gds, + RADEON_USAGE_READWRITE, 0, 0); + if (ctx->gds_oa) { + ctx->ws->cs_add_buffer(ctx->gfx_cs, ctx->gds_oa, + RADEON_USAGE_READWRITE, 0, 0); + } + } + + /* Always invalidate caches at the beginning of IBs, because external * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our * buffers. @@ -352,6 +419,19 @@ void si_begin_new_gfx_cs(struct si_context *ctx) ctx->last_num_tcs_input_cp = -1; ctx->last_ls_hs_config = -1; /* impossible value */ + ctx->prim_discard_compute_ib_initialized = false; + + /* Compute-based primitive discard: + * The index ring is divided into 2 halves. Switch between the halves + * in the same fashion as doublebuffering. + */ + if (ctx->index_ring_base) + ctx->index_ring_base = 0; + else + ctx->index_ring_base = ctx->index_ring_size_per_ib; + + ctx->index_ring_offset = 0; + if (has_clear_state) { ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0x00000000; ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0x00000000; diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index d9dae8363f0..9f7159d66c4 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -80,6 +80,9 @@ static const struct debug_named_value debug_options[] = { { "zerovram", DBG(ZERO_VRAM), "Clear VRAM allocations." }, /* 3D engine options: */ + { "alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader." }, + { "pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls." }, + { "nopd", DBG(NO_PD), "Disable the primitive discard compute shader." }, { "switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet." 
}, { "nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization" }, { "nodpbb", DBG(NO_DPBB), "Disable DPBB." }, @@ -255,7 +258,13 @@ static void si_destroy_context(struct pipe_context *context) sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL); sctx->ws->fence_reference(&sctx->last_sdma_fence, NULL); + sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL); si_resource_reference(&sctx->eop_bug_scratch, NULL); + si_resource_reference(&sctx->index_ring, NULL); + si_resource_reference(&sctx->barrier_buf, NULL); + si_resource_reference(&sctx->last_ib_barrier_buf, NULL); + pb_reference(&sctx->gds, NULL); + pb_reference(&sctx->gds_oa, NULL); si_destroy_compiler(&sctx->compiler); @@ -533,6 +542,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, sctx->blitter->skip_viewport_restore = true; si_init_draw_functions(sctx); + si_initialize_prim_discard_tunables(sctx); } /* Initialize SDMA functions. */ @@ -554,7 +564,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, if (sctx->chip_class >= GFX9) { sctx->wait_mem_scratch = si_resource( - pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 4)); + pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 8)); if (!sctx->wait_mem_scratch) goto fail; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 488ae74f4c1..0d00a9b17b4 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -39,7 +39,7 @@ #endif #define ATI_VENDOR_ID 0x1002 - +#define SI_PRIM_DISCARD_DEBUG 0 #define SI_NOT_QUERY 0xffffffff /* The base vertex and primitive restart can be any number, but we must pick @@ -165,6 +165,9 @@ enum { DBG_ZERO_VRAM, /* 3D engine options: */ + DBG_ALWAYS_PD, + DBG_PD, + DBG_NO_PD, DBG_SWITCH_ON_EOP, DBG_NO_OUT_OF_ORDER, DBG_NO_DPBB, @@ -209,6 +212,7 @@ enum si_coherency { }; struct si_compute; +struct si_shader_context; struct hash_table; struct u_suballocator; @@ -675,6 
+679,7 @@ struct si_signed_scissor { struct si_viewports { struct pipe_viewport_state states[SI_MAX_VIEWPORTS]; struct si_signed_scissor as_scissor[SI_MAX_VIEWPORTS]; + bool y_inverted; }; struct si_clip_state { @@ -780,10 +785,12 @@ struct si_saved_cs { struct pipe_reference reference; struct si_context *ctx; struct radeon_saved_cs gfx; + struct radeon_saved_cs compute; struct si_resource *trace_buf; unsigned trace_id; unsigned gfx_last_dw; + unsigned compute_last_dw; bool flushed; int64_t time_flush; }; @@ -839,6 +846,7 @@ struct si_context { struct pipe_debug_callback debug; struct ac_llvm_compiler compiler; /* only non-threaded compilation */ struct si_shader_ctx_state fixed_func_tcs_shader; + /* Offset 0: EOP flush number; Offset 4: GDS prim restart counter */ struct si_resource *wait_mem_scratch; unsigned wait_mem_number; uint16_t prefetch_L2_mask; @@ -859,6 +867,31 @@ struct si_context { uint64_t vram; uint64_t gtt; + /* Compute-based primitive discard. */ + unsigned prim_discard_vertex_count_threshold; + struct pb_buffer *gds; + struct pb_buffer *gds_oa; + struct radeon_cmdbuf *prim_discard_compute_cs; + unsigned compute_gds_offset; + struct si_shader *compute_ib_last_shader; + uint32_t compute_rewind_va; + unsigned compute_num_prims_in_batch; + bool preserve_prim_restart_gds_at_flush; + /* index_ring is divided into 2 halves for doublebuffering. */ + struct si_resource *index_ring; + unsigned index_ring_base; /* offset of a per-IB portion */ + unsigned index_ring_offset; /* offset within a per-IB portion */ + unsigned index_ring_size_per_ib; /* max available size per IB */ + bool prim_discard_compute_ib_initialized; + /* For tracking the last execution barrier - it can be either + * a WRITE_DATA packet or a fence. 
*/ + uint32_t *last_pkt3_write_data; + struct si_resource *barrier_buf; + unsigned barrier_buf_offset; + struct pipe_fence_handle *last_ib_barrier_fence; + struct si_resource *last_ib_barrier_buf; + unsigned last_ib_barrier_buf_offset; + /* Atoms (direct states). */ union si_state_atoms atoms; unsigned dirty_atoms; /* mask */ @@ -895,6 +928,7 @@ struct si_context { struct si_shader_ctx_state vs_shader; struct si_shader_ctx_state tcs_shader; struct si_shader_ctx_state tes_shader; + struct si_shader_ctx_state cs_prim_discard_state; struct si_cs_shader_state cs_shader_state; /* shader information */ @@ -963,6 +997,7 @@ struct si_context { /* Emitted draw state. */ bool gs_tri_strip_adj_fix:1; bool ls_vgpr_fix:1; + bool prim_discard_cs_instancing:1; int last_index_size; int last_base_vertex; int last_start_instance; @@ -1076,6 +1111,7 @@ struct si_context { /* Maintain the list of active queries for pausing between IBs. */ int num_occlusion_queries; int num_perfect_occlusion_queries; + int num_pipeline_stat_queries; struct list_head active_queries; unsigned num_cs_dw_queries_suspend; @@ -1311,6 +1347,26 @@ unsigned si_get_compute_resource_limits(struct si_screen *sscreen, unsigned threadgroups_per_cu); void si_init_compute_functions(struct si_context *sctx); +/* si_compute_prim_discard.c */ +enum si_prim_discard_outcome { + SI_PRIM_DISCARD_ENABLED, + SI_PRIM_DISCARD_DISABLED, + SI_PRIM_DISCARD_DRAW_SPLIT, +}; + +void si_build_prim_discard_compute_shader(struct si_shader_context *ctx); +enum si_prim_discard_outcome +si_prepare_prim_discard_or_split_draw(struct si_context *sctx, + const struct pipe_draw_info *info); +void si_compute_signal_gfx(struct si_context *sctx); +void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, + const struct pipe_draw_info *info, + unsigned index_size, + unsigned base_vertex, + uint64_t input_indexbuf_va, + unsigned input_indexbuf_max_elements); +void si_initialize_prim_discard_tunables(struct si_context *sctx); + /* 
si_perfcounters.c */ void si_init_perfcounters(struct si_screen *screen); void si_destroy_perfcounters(struct si_screen *screen); @@ -1748,6 +1804,11 @@ radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sctx, radeon_add_to_buffer_list(sctx, sctx->gfx_cs, bo, usage, priority); } +static inline bool si_compute_prim_discard_enabled(struct si_context *sctx) +{ + return sctx->prim_discard_vertex_count_threshold != UINT_MAX; +} + #define PRINT_ERR(fmt, args...) \ fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args) diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c index 0e44d751288..1dd9249d57c 100644 --- a/src/gallium/drivers/radeonsi/si_query.c +++ b/src/gallium/drivers/radeonsi/si_query.c @@ -850,6 +850,9 @@ static void si_query_hw_emit_start(struct si_context *sctx, si_update_occlusion_query_state(sctx, query->b.type, 1); si_update_prims_generated_query_state(sctx, query->b.type, 1); + if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS) + sctx->num_pipeline_stat_queries++; + if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA) si_need_gfx_cs_space(sctx); @@ -954,6 +957,9 @@ static void si_query_hw_emit_stop(struct si_context *sctx, si_update_occlusion_query_state(sctx, query->b.type, -1); si_update_prims_generated_query_state(sctx, query->b.type, -1); + + if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS) + sctx->num_pipeline_stat_queries--; } static void emit_set_predicate(struct si_context *ctx, diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index c31e94fe351..7260a6b8df6 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -25,6 +25,7 @@ #include "util/u_memory.h" #include "util/u_string.h" #include "tgsi/tgsi_build.h" +#include "tgsi/tgsi_strings.h" #include "tgsi/tgsi_util.h" #include "tgsi/tgsi_dump.h" @@ -3548,6 +3549,33 @@ static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, 
FREE(outputs); } +static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi, + unsigned max_outputs, + LLVMValueRef *addrs) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct tgsi_shader_info *info = &ctx->shader->selector->info; + LLVMValueRef pos[4] = {}; + + assert(info->num_outputs <= max_outputs); + + for (unsigned i = 0; i < info->num_outputs; i++) { + if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION) + continue; + + for (unsigned chan = 0; chan < 4; chan++) + pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); + break; + } + assert(pos[0] != NULL); + + /* Return the position output. */ + LLVMValueRef ret = ctx->return_value; + for (unsigned chan = 0; chan < 4; chan++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, ""); + ctx->return_value = ret; +} + static void si_tgsi_emit_epilogue(struct lp_build_tgsi_context *bld_base) { struct si_shader_context *ctx = si_shader_context(bld_base); @@ -4518,6 +4546,12 @@ static void create_function(struct si_shader_context *ctx) /* VGPRs */ declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs); + + /* Return values */ + if (shader->key.opt.vs_as_prim_discard_cs) { + for (i = 0; i < 4; i++) + returns[num_returns++] = ctx->f32; /* VGPRs */ + } break; case PIPE_SHADER_TESS_CTRL: /* GFX6-GFX8 */ @@ -5317,6 +5351,8 @@ const char *si_get_shader_name(const struct si_shader *shader, unsigned processo return "Vertex Shader as ES"; else if (shader->key.as_ls) return "Vertex Shader as LS"; + else if (shader->key.opt.vs_as_prim_discard_cs) + return "Vertex Shader as Primitive Discard CS"; else return "Vertex Shader as VS"; case PIPE_SHADER_TESS_CTRL: @@ -5699,6 +5735,28 @@ static void si_dump_shader_key(unsigned processor, const struct si_shader *shade fprintf(f, " as_ls = %u\n", key->as_ls); fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id); + fprintf(f, " opt.vs_as_prim_discard_cs = %u\n", + 
key->opt.vs_as_prim_discard_cs); + fprintf(f, " opt.cs_prim_type = %s\n", + tgsi_primitive_names[key->opt.cs_prim_type]); + fprintf(f, " opt.cs_indexed = %u\n", + key->opt.cs_indexed); + fprintf(f, " opt.cs_instancing = %u\n", + key->opt.cs_instancing); + fprintf(f, " opt.cs_primitive_restart = %u\n", + key->opt.cs_primitive_restart); + fprintf(f, " opt.cs_provoking_vertex_first = %u\n", + key->opt.cs_provoking_vertex_first); + fprintf(f, " opt.cs_need_correct_orientation = %u\n", + key->opt.cs_need_correct_orientation); + fprintf(f, " opt.cs_cull_front = %u\n", + key->opt.cs_cull_front); + fprintf(f, " opt.cs_cull_back = %u\n", + key->opt.cs_cull_back); + fprintf(f, " opt.cs_cull_z = %u\n", + key->opt.cs_cull_z); + fprintf(f, " opt.cs_halfz_clip_space = %u\n", + key->opt.cs_halfz_clip_space); break; case PIPE_SHADER_TESS_CTRL: @@ -5854,6 +5912,8 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx) ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue; else if (shader->key.as_es) ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; + else if (shader->key.opt.vs_as_prim_discard_cs) + ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue; else ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; bld_base->emit_epilogue = si_tgsi_emit_epilogue; @@ -6644,6 +6704,9 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, si_build_wrapper_function(&ctx, parts + !need_prolog, 1 + need_prolog, need_prolog, 0); + + if (ctx.shader->key.opt.vs_as_prim_discard_cs) + si_build_prim_discard_compute_shader(&ctx); } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) { if (sscreen->info.chip_class >= GFX9) { struct si_shader_selector *ls = shader->key.part.tcs.ls; diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 3a63292658b..bc9299bda66 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -340,6 +340,7 @@ struct si_shader_selector { unsigned type; 
bool vs_needs_prolog; bool force_correct_derivs_after_kill; + bool prim_discard_cs_allowed; unsigned pa_cl_vs_out_cntl; ubyte clipdist_mask; ubyte culldist_mask; @@ -554,6 +555,19 @@ struct si_shader_key { * possible, because it's in the "opt" group. */ unsigned prefer_mono:1; + + /* Primitive discard compute shader. */ + unsigned vs_as_prim_discard_cs:1; + unsigned cs_prim_type:4; + unsigned cs_indexed:1; + unsigned cs_instancing:1; + unsigned cs_primitive_restart:1; + unsigned cs_provoking_vertex_first:1; + unsigned cs_need_correct_orientation:1; + unsigned cs_cull_front:1; + unsigned cs_cull_back:1; + unsigned cs_cull_z:1; + unsigned cs_halfz_clip_space:1; } opt; }; diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 7debbc1f3d4..f9e8adc9f5b 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -857,6 +857,15 @@ static void *si_create_rs_state(struct pipe_context *ctx, return NULL; } + if (!state->front_ccw) { + rs->cull_front = !!(state->cull_face & PIPE_FACE_FRONT); + rs->cull_back = !!(state->cull_face & PIPE_FACE_BACK); + } else { + rs->cull_back = !!(state->cull_face & PIPE_FACE_FRONT); + rs->cull_front = !!(state->cull_face & PIPE_FACE_BACK); + } + rs->depth_clamp_any = !state->depth_clip_near || !state->depth_clip_far; + rs->provoking_vertex_first = state->flatshade_first; rs->scissor_enable = state->scissor; rs->clip_halfz = state->clip_halfz; rs->two_side = state->light_twoside; diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 613aa32a312..05e974d4c12 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -87,6 +87,10 @@ struct si_state_rasterizer { unsigned rasterizer_discard:1; unsigned scissor_enable:1; unsigned clip_halfz:1; + unsigned cull_front:1; + unsigned cull_back:1; + unsigned depth_clamp_any:1; + unsigned provoking_vertex_first:1; }; struct 
si_dsa_stencil_ref_part { @@ -600,6 +604,7 @@ void si_shader_selector_key_vs(struct si_context *sctx, struct si_vs_prolog_bits *prolog_key); /* si_state_draw.c */ +void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx); void si_emit_cache_flush(struct si_context *sctx); void si_trace_emit(struct si_context *sctx); void si_init_draw_functions(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index ad1cf246996..b4b3fe323ee 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -29,6 +29,7 @@ #include "util/u_log.h" #include "util/u_upload_mgr.h" #include "util/u_prim.h" +#include "util/u_suballoc.h" #include "ac_debug.h" @@ -676,7 +677,9 @@ static void si_emit_draw_packets(struct si_context *sctx, struct pipe_resource *indexbuf, unsigned index_size, unsigned index_offset, - unsigned instance_count) + unsigned instance_count, + bool dispatch_prim_discard_cs, + unsigned original_index_size) { struct pipe_draw_indirect_info *indirect = info->indirect; struct radeon_cmdbuf *cs = sctx->gfx_cs; @@ -735,13 +738,15 @@ static void si_emit_draw_packets(struct si_context *sctx, sctx->last_index_size = index_size; } - index_max_size = (indexbuf->width0 - index_offset) / - index_size; - index_va = si_resource(indexbuf)->gpu_address + index_offset; + if (original_index_size) { + index_max_size = (indexbuf->width0 - index_offset) / + original_index_size; + index_va = si_resource(indexbuf)->gpu_address + index_offset; - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(indexbuf), - RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER); + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, + si_resource(indexbuf), + RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER); + } } else { /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE, * so the state must be re-emitted before the next indexed draw. 
@@ -828,7 +833,7 @@ static void si_emit_draw_packets(struct si_context *sctx, } /* Base vertex and start instance. */ - base_vertex = index_size ? info->index_bias : info->start; + base_vertex = original_index_size ? info->index_bias : info->start; if (sctx->num_vs_blit_sgprs) { /* Re-emit draw constants after we leave u_blitter. */ @@ -856,6 +861,17 @@ static void si_emit_draw_packets(struct si_context *sctx, } if (index_size) { + if (dispatch_prim_discard_cs) { + index_va += info->start * original_index_size; + index_max_size = MIN2(index_max_size, info->count); + + si_dispatch_prim_discard_cs_and_draw(sctx, info, + original_index_size, + base_vertex, + index_va, index_max_size); + return; + } + index_va += info->start * index_size; radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit)); @@ -902,6 +918,33 @@ static void si_emit_surface_sync(struct si_context *sctx, sctx->context_roll = true; } +void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx) +{ + if (!si_compute_prim_discard_enabled(sctx)) + return; + + if (!sctx->barrier_buf) { + u_suballocator_alloc(sctx->allocator_zeroed_memory, 4, 4, + &sctx->barrier_buf_offset, + (struct pipe_resource**)&sctx->barrier_buf); + } + + /* Emit a placeholder to signal the next compute IB to start. + * See si_compute_prim_discard.c for explanation. + */ + uint32_t signal = 1; + si_cp_write_data(sctx, sctx->barrier_buf, sctx->barrier_buf_offset, + 4, V_370_MEM, V_370_ME, &signal); + + sctx->last_pkt3_write_data = + &sctx->gfx_cs->current.buf[sctx->gfx_cs->current.cdw - 5]; + + /* Only the last occurrence of WRITE_DATA will be executed. + * The packet will be enabled in si_flush_gfx_cs.
+ */ + *sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0); +} + void si_emit_cache_flush(struct si_context *sctx) { struct radeon_cmdbuf *cs = sctx->gfx_cs; @@ -919,8 +962,18 @@ void si_emit_cache_flush(struct si_context *sctx) } uint32_t cp_coher_cntl = 0; - uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB | - SI_CONTEXT_FLUSH_AND_INV_DB); + const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB | + SI_CONTEXT_FLUSH_AND_INV_DB); + const bool is_barrier = flush_cb_db || + /* INV_ICACHE == beginning of gfx IB. Checking + * INV_ICACHE fixes corruption for DeusExMD with + * compute-based culling, but I don't know why. + */ + flags & (SI_CONTEXT_INV_ICACHE | + SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_VS_PARTIAL_FLUSH) || + (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && + sctx->compute_is_busy); if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) sctx->num_cb_cache_flushes++; @@ -1144,6 +1197,9 @@ void si_emit_cache_flush(struct si_context *sctx) if (cp_coher_cntl) si_emit_surface_sync(sctx, cp_coher_cntl); + if (is_barrier) + si_prim_discard_signal_next_compute_ib_start(sctx); + if (flags & SI_CONTEXT_START_PIPELINE_STATS) { radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | @@ -1260,6 +1316,94 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i primitive_restart); } +static bool +si_all_vs_resources_read_only(struct si_context *sctx, + struct pipe_resource *indexbuf) +{ + struct radeon_winsys *ws = sctx->ws; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + /* Index buffer. */ + if (indexbuf && + ws->cs_is_buffer_referenced(cs, si_resource(indexbuf)->buf, + RADEON_USAGE_WRITE)) + return false; + + /* Vertex buffers. 
*/ + struct si_vertex_elements *velems = sctx->vertex_elements; + unsigned num_velems = velems->count; + + for (unsigned i = 0; i < num_velems; i++) { + if (!((1 << i) & velems->first_vb_use_mask)) + continue; + + unsigned vb_index = velems->vertex_buffer_index[i]; + struct pipe_resource *res = sctx->vertex_buffer[vb_index].buffer.resource; + if (!res) + continue; + + if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, + RADEON_USAGE_WRITE)) + return false; + } + + /* Constant and shader buffers. */ + struct si_descriptors *buffers = + &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX)]; + for (unsigned i = 0; i < buffers->num_active_slots; i++) { + unsigned index = buffers->first_active_slot + i; + struct pipe_resource *res = + sctx->const_and_shader_buffers[PIPE_SHADER_VERTEX].buffers[index]; + if (!res) + continue; + + if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, + RADEON_USAGE_WRITE)) + return false; + } + + /* Samplers. */ + struct si_shader_selector *vs = sctx->vs_shader.cso; + if (vs->info.samplers_declared) { + unsigned num_samplers = util_last_bit(vs->info.samplers_declared); + + for (unsigned i = 0; i < num_samplers; i++) { + struct pipe_sampler_view *view = sctx->samplers[PIPE_SHADER_VERTEX].views[i]; + if (!view) + continue; + + if (ws->cs_is_buffer_referenced(cs, + si_resource(view->texture)->buf, + RADEON_USAGE_WRITE)) + return false; + } + } + + /* Images. 
*/ + if (vs->info.images_declared) { + unsigned num_images = util_last_bit(vs->info.images_declared); + + for (unsigned i = 0; i < num_images; i++) { + struct pipe_resource *res = sctx->images[PIPE_SHADER_VERTEX].views[i].resource; + if (!res) + continue; + + if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, + RADEON_USAGE_WRITE)) + return false; + } + } + + return true; +} + +static ALWAYS_INLINE bool pd_msg(const char *s) +{ + if (SI_PRIM_DISCARD_DEBUG) + printf("PD failed: %s\n", s); + return false; +} + static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) { struct si_context *sctx = (struct si_context *)ctx; @@ -1370,9 +1514,6 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i } } - if (sctx->do_update_shaders && !si_update_shaders(sctx)) - goto return_cleanup; - if (index_size) { /* Translate or upload, if needed. */ /* 8-bit indices are supported on GFX8. */ @@ -1425,6 +1566,11 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i } } + bool dispatch_prim_discard_cs = false; + bool prim_discard_cs_instancing = false; + unsigned original_index_size = index_size; + unsigned direct_count = 0; + if (info->indirect) { struct pipe_draw_indirect_info *indirect = info->indirect; @@ -1444,8 +1590,80 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false; } } + } else { + direct_count = info->count * instance_count; + } + + /* Determine if we can use the primitive discard compute shader. */ + if (si_compute_prim_discard_enabled(sctx) && + /* Multiply by 3 for strips and fans to get the vertex count as triangles. */ + direct_count * (prim == PIPE_PRIM_TRIANGLES ? 1 : 3) > + sctx->prim_discard_vertex_count_threshold && + (!info->count_from_stream_output || pd_msg("draw_opaque")) && + (primitive_restart ? 
+ /* Supported prim types with primitive restart: */ + (prim == PIPE_PRIM_TRIANGLE_STRIP || pd_msg("bad prim type with primitive restart")) && + /* Disallow instancing with primitive restart: */ + (instance_count == 1 || pd_msg("instance_count > 1 with primitive restart")) : + /* Supported prim types without primitive restart + allow instancing: */ + (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | + (1 << PIPE_PRIM_TRIANGLE_STRIP) | + (1 << PIPE_PRIM_TRIANGLE_FAN)) && + /* Instancing is limited to 16-bit indices, because InstanceID is packed into VertexID. */ + /* TODO: DrawArraysInstanced sometimes doesn't work, so it's disabled. */ + (instance_count == 1 || + (instance_count <= USHRT_MAX && index_size && index_size <= 2) || + pd_msg("instance_count too large or index_size == 4 or DrawArraysInstanced"))) && + (info->drawid == 0 || !sctx->vs_shader.cso->info.uses_drawid || pd_msg("draw_id > 0")) && + (!sctx->render_cond || pd_msg("render condition")) && + /* Forced enablement ignores pipeline statistics queries.
*/ + (sctx->screen->debug_flags & (DBG(PD) | DBG(ALWAYS_PD)) || + (!sctx->num_pipeline_stat_queries && !sctx->streamout.prims_gen_query_enabled) || + pd_msg("pipestat or primgen query")) && + (!sctx->vertex_elements->instance_divisor_is_fetched || pd_msg("loads instance divisors")) && + (!sctx->tes_shader.cso || pd_msg("uses tess")) && + (!sctx->gs_shader.cso || pd_msg("uses GS")) && + (!sctx->ps_shader.cso->info.uses_primid || pd_msg("PS uses PrimID")) && +#if SI_PRIM_DISCARD_DEBUG /* same as cso->prim_discard_cs_allowed */ + (!sctx->vs_shader.cso->info.uses_bindless_images || pd_msg("uses bindless images")) && + (!sctx->vs_shader.cso->info.uses_bindless_samplers || pd_msg("uses bindless samplers")) && + (!sctx->vs_shader.cso->info.writes_memory || pd_msg("writes memory")) && + (!sctx->vs_shader.cso->info.writes_viewport_index || pd_msg("writes viewport index")) && + !sctx->vs_shader.cso->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] && + !sctx->vs_shader.cso->so.num_outputs && +#else + (sctx->vs_shader.cso->prim_discard_cs_allowed || pd_msg("VS shader uses unsupported features")) && +#endif + /* Check that all buffers are used for read only, because compute + * dispatches can run ahead. */ + (si_all_vs_resources_read_only(sctx, index_size ? 
indexbuf : NULL) || pd_msg("write reference"))) { + switch (si_prepare_prim_discard_or_split_draw(sctx, info)) { + case SI_PRIM_DISCARD_ENABLED: + original_index_size = index_size; + prim_discard_cs_instancing = instance_count > 1; + dispatch_prim_discard_cs = true; + + /* The compute shader changes/lowers the following: */ + prim = PIPE_PRIM_TRIANGLES; + index_size = 4; + instance_count = 1; + primitive_restart = false; + break; + case SI_PRIM_DISCARD_DISABLED: + break; + case SI_PRIM_DISCARD_DRAW_SPLIT: + goto return_cleanup; + } } + if (prim_discard_cs_instancing != sctx->prim_discard_cs_instancing) { + sctx->prim_discard_cs_instancing = prim_discard_cs_instancing; + sctx->do_update_shaders = true; + } + + if (sctx->do_update_shaders && !si_update_shaders(sctx)) + goto return_cleanup; + si_need_gfx_cs_space(sctx); if (sctx->bo_list_add_all_gfx_resources) @@ -1507,7 +1725,8 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i sctx->dirty_atoms = 0; si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset, - instance_count); + instance_count, dispatch_prim_discard_cs, + original_index_size); /* <-- CUs are busy here. */ /* Start prefetches after the draw has been started. Both will run @@ -1527,7 +1746,7 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i cik_emit_prefetch_L2(sctx, true); if (!si_upload_graphics_shader_descriptors(sctx)) - return; + goto return_cleanup; si_emit_all_states(sctx, info, prim, instance_count, primitive_restart, masked_atoms); @@ -1540,7 +1759,8 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i sctx->dirty_atoms = 0; si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset, - instance_count); + instance_count, dispatch_prim_discard_cs, + original_index_size); /* Prefetch the remaining shaders after the draw has been * started. 
*/ diff --git a/src/gallium/drivers/radeonsi/si_state_msaa.c b/src/gallium/drivers/radeonsi/si_state_msaa.c index e6d97fe6727..0fa38918b20 100644 --- a/src/gallium/drivers/radeonsi/si_state_msaa.c +++ b/src/gallium/drivers/radeonsi/si_state_msaa.c @@ -82,6 +82,10 @@ * Right half: {1,3,5,7,9,11,13,15} */ +/* Important note: We have to use the standard DX positions, because + * the primitive discard compute shader relies on them. + */ + /* 1x MSAA */ static const uint32_t sample_locs_1x = FILL_SREG( 0, 0, 0, 0, 0, 0, 0, 0); /* S1, S2, S3 fields are not used by 1x */ diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 28360aae965..628844df7e3 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1383,6 +1383,8 @@ void si_shader_selector_key_vs(struct si_context *sctx, prolog_key->instance_divisor_is_one = elts->instance_divisor_is_one; prolog_key->instance_divisor_is_fetched = elts->instance_divisor_is_fetched; + prolog_key->unpack_instance_id_from_vertex_id = + sctx->prim_discard_cs_instancing; /* Prefer a monolithic shader to allow scheduling divisions around * VBO loads. */ @@ -1910,8 +1912,11 @@ current_not_ready: /* Compile the main shader part if it doesn't exist. This can happen * if the initial guess was wrong. + * + * The prim discard CS doesn't need the main shader part. */ - if (!is_pure_monolithic) { + if (!is_pure_monolithic && + !key->opt.vs_as_prim_discard_cs) { bool ok; /* Make sure the main shader part is present. This is needed @@ -1962,9 +1967,10 @@ current_not_ready: is_pure_monolithic || memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; + /* The prim discard CS is always optimized. 
*/ shader->is_optimized = - !is_pure_monolithic && - memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; + (!is_pure_monolithic || key->opt.vs_as_prim_discard_cs) && + memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; /* If it's an optimized shader, compile it asynchronously. */ if (shader->is_optimized && thread_index < 0) { @@ -2312,6 +2318,15 @@ static void *si_create_shader_selector(struct pipe_context *ctx, sel->info.uses_kill && sctx->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL); + sel->prim_discard_cs_allowed = + sel->type == PIPE_SHADER_VERTEX && + !sel->info.uses_bindless_images && + !sel->info.uses_bindless_samplers && + !sel->info.writes_memory && + !sel->info.writes_viewport_index && + !sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] && + !sel->so.num_outputs; + /* Set which opcode uses which (i,j) pair. */ if (sel->info.uses_persp_opcode_interp_centroid) sel->info.uses_persp_centroid = true; diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c index a144d7b661c..39c8536e46a 100644 --- a/src/gallium/drivers/radeonsi/si_state_viewport.c +++ b/src/gallium/drivers/radeonsi/si_state_viewport.c @@ -381,6 +381,12 @@ static void si_set_viewport_states(struct pipe_context *pctx, scissor->quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH; } + if (start_slot == 0) { + ctx->viewports.y_inverted = + -state->scale[1] + state->translate[1] > + state->scale[1] + state->translate[1]; + } + si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports); si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband); si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); -- 2.30.2