radeonsi: move VS_STATE.LS_OUT_PATCH_SIZE a few bits higher to make space there
[mesa.git] / src / gallium / drivers / radeonsi / si_compute_prim_discard.c
index a52966f2376ee6fa3c9e883f233f83eaf11784a7..4e691d088c2b566c85b69cff8ec536d2a7bf5d0e 100644 (file)
 /* For emulating the rewind packet on CI. */
 #define FORCE_REWIND_EMULATION         0
 
-void si_initialize_prim_discard_tunables(struct si_context *sctx)
+void si_initialize_prim_discard_tunables(struct si_screen *sscreen,
+                                        bool is_aux_context,
+                                        unsigned *prim_discard_vertex_count_threshold,
+                                        unsigned *index_ring_size_per_ib)
 {
-       sctx->prim_discard_vertex_count_threshold = UINT_MAX; /* disable */
+       *prim_discard_vertex_count_threshold = UINT_MAX; /* disable */
 
-       if (sctx->chip_class == GFX6 || /* SI support is not implemented */
-           !sctx->screen->info.has_gds_ordered_append ||
-           sctx->screen->debug_flags & DBG(NO_PD) ||
-           /* If aux_context == NULL, we are initializing aux_context right now. */
-           !sctx->screen->aux_context)
+       if (sscreen->info.chip_class == GFX6 || /* SI support is not implemented */
+           !sscreen->info.has_gds_ordered_append ||
+           sscreen->debug_flags & DBG(NO_PD) ||
+           is_aux_context)
                return;
 
        /* TODO: enable this after the GDS kernel memory management is fixed */
        bool enable_on_pro_graphics_by_default = false;
 
-       if (sctx->screen->debug_flags & DBG(ALWAYS_PD) ||
-           sctx->screen->debug_flags & DBG(PD) ||
+       if (sscreen->debug_flags & DBG(ALWAYS_PD) ||
+           sscreen->debug_flags & DBG(PD) ||
            (enable_on_pro_graphics_by_default &&
-            sctx->screen->info.is_pro_graphics &&
-            (sctx->family == CHIP_BONAIRE ||
-             sctx->family == CHIP_HAWAII ||
-             sctx->family == CHIP_TONGA ||
-             sctx->family == CHIP_FIJI ||
-             sctx->family == CHIP_POLARIS10 ||
-             sctx->family == CHIP_POLARIS11 ||
-             sctx->family == CHIP_VEGA10 ||
-             sctx->family == CHIP_VEGA20))) {
-               sctx->prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */
-
-               if (sctx->screen->debug_flags & DBG(ALWAYS_PD))
-                       sctx->prim_discard_vertex_count_threshold = 0; /* always enable */
+            sscreen->info.is_pro_graphics &&
+            (sscreen->info.family == CHIP_BONAIRE ||
+             sscreen->info.family == CHIP_HAWAII ||
+             sscreen->info.family == CHIP_TONGA ||
+             sscreen->info.family == CHIP_FIJI ||
+             sscreen->info.family == CHIP_POLARIS10 ||
+             sscreen->info.family == CHIP_POLARIS11 ||
+             sscreen->info.family == CHIP_VEGA10 ||
+             sscreen->info.family == CHIP_VEGA20))) {
+               *prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */
+
+               if (sscreen->debug_flags & DBG(ALWAYS_PD))
+                       *prim_discard_vertex_count_threshold = 0; /* always enable */
 
                const uint32_t MB = 1024 * 1024;
                const uint64_t GB = 1024 * 1024 * 1024;
@@ -224,12 +226,12 @@ void si_initialize_prim_discard_tunables(struct si_context *sctx)
                /* The total size is double this per context.
                 * Greater numbers allow bigger gfx IBs.
                 */
-               if (sctx->screen->info.vram_size <= 2 * GB)
-                       sctx->index_ring_size_per_ib = 64 * MB;
-               else if (sctx->screen->info.vram_size <= 4 * GB)
-                       sctx->index_ring_size_per_ib = 128 * MB;
+               if (sscreen->info.vram_size <= 2 * GB)
+                       *index_ring_size_per_ib = 64 * MB;
+               else if (sscreen->info.vram_size <= 4 * GB)
+                       *index_ring_size_per_ib = 128 * MB;
                else
-                       sctx->index_ring_size_per_ib = 256 * MB;
+                       *index_ring_size_per_ib = 256 * MB;
        }
 }
 
@@ -362,7 +364,7 @@ void si_build_prim_discard_compute_shader(struct si_shader_context *ctx)
        /* Create the compute shader function. */
        unsigned old_type = ctx->type;
        ctx->type = PIPE_SHADER_COMPUTE;
-       si_create_function(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE);
+       si_llvm_create_func(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE);
        ctx->type = old_type;
 
        if (VERTEX_COUNTER_GDS_MODE == 1) {
@@ -822,7 +824,7 @@ void si_build_prim_discard_compute_shader(struct si_shader_context *ctx)
                        };
                        LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4);
                        ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->i32_0,
-                                                   ctx->i32_0, 0, ac_glc | ac_slc, false);
+                                                   ctx->i32_0, 0, ac_glc | ac_slc);
                } else {
                        LLVMBuildStore(builder, count,
                                       si_expand_32bit_pointer(ctx,
@@ -939,6 +941,9 @@ static bool si_shader_select_prim_discard_cs(struct si_context *sctx,
        sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso;
        sctx->cs_prim_discard_state.current = NULL;
 
+       if (!sctx->compiler.passes)
+               si_init_compiler(sctx->screen, &sctx->compiler);
+
        struct si_compiler_ctx_state compiler_state;
        compiler_state.compiler = &sctx->compiler;
        compiler_state.debug = sctx->debug;
@@ -1310,50 +1315,18 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
                  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
                  S_008F0C_DATA_FORMAT(output_indexbuf_format);
 
-       /* Viewport state.
-        * This is needed by the small primitive culling, because it's done
-        * in screen space.
-        */
-       float scale[2], translate[2];
-
-       scale[0] = sctx->viewports.states[0].scale[0];
-       scale[1] = sctx->viewports.states[0].scale[1];
-       translate[0] = sctx->viewports.states[0].translate[0];
-       translate[1] = sctx->viewports.states[0].translate[1];
+       /* Viewport state. */
+       struct si_small_prim_cull_info cull_info;
+       si_get_small_prim_cull_info(sctx, &cull_info);
 
-       /* The viewport shouldn't flip the X axis for the small prim culling to work. */
-       assert(-scale[0] + translate[0] <= scale[0] + translate[0]);
-
-       /* If the Y axis is inverted (OpenGL default framebuffer), reverse it.
-        * This is because the viewport transformation inverts the clip space
-        * bounding box, so min becomes max, which breaks small primitive
-        * culling.
-        */
-       if (sctx->viewports.y_inverted) {
-               scale[1] = -scale[1];
-               translate[1] = -translate[1];
-       }
-
-       /* Scale the framebuffer up, so that samples become pixels and small
-        * primitive culling is the same for all sample counts.
-        * This only works with the standard DX sample positions, because
-        * the samples are evenly spaced on both X and Y axes.
-        */
-       unsigned num_samples = sctx->framebuffer.nr_samples;
-       assert(num_samples >= 1);
-
-       for (unsigned i = 0; i < 2; i++) {
-               scale[i] *= num_samples;
-               translate[i] *= num_samples;
-       }
-
-       desc[8] = fui(scale[0]);
-       desc[9] = fui(scale[1]);
-       desc[10] = fui(translate[0]);
-       desc[11] = fui(translate[1]);
+       desc[8] = fui(cull_info.scale[0]);
+       desc[9] = fui(cull_info.scale[1]);
+       desc[10] = fui(cull_info.translate[0]);
+       desc[11] = fui(cull_info.translate[1]);
 
        /* Better subpixel precision increases the efficiency of small
         * primitive culling. */
+       unsigned num_samples = sctx->framebuffer.nr_samples;
        unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode;
        float small_prim_cull_precision;