anv,radv: Drop XML workarounds for VK_ANDROID_native_buffer
[mesa.git] / src / amd / vulkan / si_cmd_buffer.c
index f5dc26d392e365797ec003aa32a101abadf40c33..aed291be35d68349437c83a0cb72bc31d0b33419 100644 (file)
@@ -147,9 +147,10 @@ si_write_harvested_raster_configs(struct radv_physical_device *physical_device,
 
                /* GRBM_GFX_INDEX has a different offset on SI and CI+ */
                if (physical_device->rad_info.chip_class < CIK)
-                       radeon_set_config_reg(cs, GRBM_GFX_INDEX,
-                                             SE_INDEX(se) | SH_BROADCAST_WRITES |
-                                             INSTANCE_BROADCAST_WRITES);
+                       radeon_set_config_reg(cs, R_00802C_GRBM_GFX_INDEX,
+                                             S_00802C_SE_INDEX(se) |
+                                             S_00802C_SH_BROADCAST_WRITES(1) |
+                                             S_00802C_INSTANCE_BROADCAST_WRITES(1));
                else
                        radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
                                               S_030800_SE_INDEX(se) | S_030800_SH_BROADCAST_WRITES(1) |
@@ -161,9 +162,10 @@ si_write_harvested_raster_configs(struct radv_physical_device *physical_device,
 
        /* GRBM_GFX_INDEX has a different offset on SI and CI+ */
        if (physical_device->rad_info.chip_class < CIK)
-               radeon_set_config_reg(cs, GRBM_GFX_INDEX,
-                                     SE_BROADCAST_WRITES | SH_BROADCAST_WRITES |
-                                     INSTANCE_BROADCAST_WRITES);
+               radeon_set_config_reg(cs, R_00802C_GRBM_GFX_INDEX,
+                                     S_00802C_SE_BROADCAST_WRITES(1) |
+                                     S_00802C_SH_BROADCAST_WRITES(1) |
+                                     S_00802C_INSTANCE_BROADCAST_WRITES(1));
        else
                radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
                                       S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) |
@@ -179,7 +181,8 @@ si_emit_compute(struct radv_physical_device *physical_device,
        radeon_emit(cs, 0);
        radeon_emit(cs, 0);
 
-       radeon_set_sh_reg_seq(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, 3);
+       radeon_set_sh_reg_seq(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
+                             S_00B854_WAVES_PER_SH(0x3));
        radeon_emit(cs, 0);
        /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1 */
        radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
@@ -332,29 +335,41 @@ si_emit_config(struct radv_physical_device *physical_device,
 {
        int i;
 
+       /* Only SI can disable CLEAR_STATE for now. */
+       assert(physical_device->has_clear_state ||
+              physical_device->rad_info.chip_class == SI);
+
        radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
        radeon_emit(cs, CONTEXT_CONTROL_LOAD_ENABLE(1));
        radeon_emit(cs, CONTEXT_CONTROL_SHADOW_ENABLE(1));
 
+       if (physical_device->has_clear_state) {
+               radeon_emit(cs, PKT3(PKT3_CLEAR_STATE, 0, 0));
+               radeon_emit(cs, 0);
+       }
+
        if (physical_device->rad_info.chip_class <= VI)
                si_set_raster_config(physical_device, cs);
 
        radeon_set_context_reg(cs, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64));
-       radeon_set_context_reg(cs, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0));
+       if (!physical_device->has_clear_state)
+               radeon_set_context_reg(cs, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0));
 
        /* FIXME calculate these values somehow ??? */
-       radeon_set_context_reg(cs, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES);
-       radeon_set_context_reg(cs, R_028A58_VGT_ES_PER_GS, 0x40);
-       radeon_set_context_reg(cs, R_028A5C_VGT_GS_PER_VS, 0x2);
+       if (physical_device->rad_info.chip_class <= VI) {
+               radeon_set_context_reg(cs, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES);
+               radeon_set_context_reg(cs, R_028A58_VGT_ES_PER_GS, 0x40);
+       }
 
-       radeon_set_context_reg(cs, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0);
-       radeon_set_context_reg(cs, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
+       if (!physical_device->has_clear_state) {
+               radeon_set_context_reg(cs, R_028A5C_VGT_GS_PER_VS, 0x2);
+               radeon_set_context_reg(cs, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0);
+               radeon_set_context_reg(cs, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
+       }
 
-       radeon_set_context_reg(cs, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
        radeon_set_context_reg(cs, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1);
-       if (physical_device->rad_info.chip_class >= GFX9)
-               radeon_set_context_reg(cs, R_028AB4_VGT_REUSE_OFF, 0);
-       radeon_set_context_reg(cs, R_028AB8_VGT_VTX_CNT_EN, 0x0);
+       if (!physical_device->has_clear_state)
+               radeon_set_context_reg(cs, R_028AB8_VGT_VTX_CNT_EN, 0x0);
        if (physical_device->rad_info.chip_class < CIK)
                radeon_set_config_reg(cs, R_008A14_PA_CL_ENHANCE, S_008A14_NUM_CLIP_SEQ(3) |
                                      S_008A14_CLIP_VTX_REORDER_ENA(1));
@@ -362,30 +377,43 @@ si_emit_config(struct radv_physical_device *physical_device,
        radeon_set_context_reg(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 0x76543210);
        radeon_set_context_reg(cs, R_028BD8_PA_SC_CENTROID_PRIORITY_1, 0xfedcba98);
 
-       radeon_set_context_reg(cs, R_02882C_PA_SU_PRIM_FILTER_CNTL, 0);
+       if (!physical_device->has_clear_state)
+               radeon_set_context_reg(cs, R_02882C_PA_SU_PRIM_FILTER_CNTL, 0);
+
+       /* CLEAR_STATE doesn't clear these correctly on certain generations.
+        * I don't know why. Deduced by trial and error.
+        */
+       if (physical_device->rad_info.chip_class <= CIK) {
+               radeon_set_context_reg(cs, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
+               radeon_set_context_reg(cs, R_028204_PA_SC_WINDOW_SCISSOR_TL,
+                                      S_028204_WINDOW_OFFSET_DISABLE(1));
+               radeon_set_context_reg(cs, R_028240_PA_SC_GENERIC_SCISSOR_TL,
+                                      S_028240_WINDOW_OFFSET_DISABLE(1));
+               radeon_set_context_reg(cs, R_028244_PA_SC_GENERIC_SCISSOR_BR,
+                                      S_028244_BR_X(16384) | S_028244_BR_Y(16384));
+               radeon_set_context_reg(cs, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0);
+               radeon_set_context_reg(cs, R_028034_PA_SC_SCREEN_SCISSOR_BR,
+                                      S_028034_BR_X(16384) | S_028034_BR_Y(16384));
+       }
 
-       for (i = 0; i < 16; i++) {
-               radeon_set_context_reg(cs, R_0282D0_PA_SC_VPORT_ZMIN_0 + i*8, 0);
-               radeon_set_context_reg(cs, R_0282D4_PA_SC_VPORT_ZMAX_0 + i*8, fui(1.0));
+       if (!physical_device->has_clear_state) {
+               for (i = 0; i < 16; i++) {
+                       radeon_set_context_reg(cs, R_0282D0_PA_SC_VPORT_ZMIN_0 + i*8, 0);
+                       radeon_set_context_reg(cs, R_0282D4_PA_SC_VPORT_ZMAX_0 + i*8, fui(1.0));
+               }
+       }
+
+       if (!physical_device->has_clear_state) {
+               radeon_set_context_reg(cs, R_02820C_PA_SC_CLIPRECT_RULE, 0xFFFF);
+               radeon_set_context_reg(cs, R_028230_PA_SC_EDGERULE, 0xAAAAAAAA);
+               /* PA_SU_HARDWARE_SCREEN_OFFSET must be 0 due to hw bug on SI */
+               radeon_set_context_reg(cs, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET, 0);
+               radeon_set_context_reg(cs, R_028820_PA_CL_NANINF_CNTL, 0);
+               radeon_set_context_reg(cs, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0);
+               radeon_set_context_reg(cs, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0);
+               radeon_set_context_reg(cs, R_028AC8_DB_PRELOAD_CONTROL, 0x0);
        }
 
-       radeon_set_context_reg(cs, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1));
-       radeon_set_context_reg(cs, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1));
-       radeon_set_context_reg(cs, R_028244_PA_SC_GENERIC_SCISSOR_BR,
-                              S_028244_BR_X(16384) | S_028244_BR_Y(16384));
-       radeon_set_context_reg(cs, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0);
-       radeon_set_context_reg(cs, R_028034_PA_SC_SCREEN_SCISSOR_BR,
-                              S_028034_BR_X(16384) | S_028034_BR_Y(16384));
-
-       radeon_set_context_reg(cs, R_02820C_PA_SC_CLIPRECT_RULE, 0xFFFF);
-       radeon_set_context_reg(cs, R_028230_PA_SC_EDGERULE, 0xAAAAAAAA);
-       /* PA_SU_HARDWARE_SCREEN_OFFSET must be 0 due to hw bug on SI */
-       radeon_set_context_reg(cs, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET, 0);
-       radeon_set_context_reg(cs, R_028820_PA_CL_NANINF_CNTL, 0);
-
-       radeon_set_context_reg(cs, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0);
-       radeon_set_context_reg(cs, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0);
-       radeon_set_context_reg(cs, R_028AC8_DB_PRELOAD_CONTROL, 0x0);
        radeon_set_context_reg(cs, R_02800C_DB_RENDER_OVERRIDE,
                               S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) |
                               S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE));
@@ -395,6 +423,11 @@ si_emit_config(struct radv_physical_device *physical_device,
                radeon_set_uconfig_reg(cs, R_030924_VGT_MIN_VTX_INDX, 0);
                radeon_set_uconfig_reg(cs, R_030928_VGT_INDX_OFFSET, 0);
        } else {
+               /* These registers, when written, also overwrite the
+                * CLEAR_STATE context, so we can't rely on CLEAR_STATE setting
+                * them.  It would be an issue if there was another UMD
+                * changing them.
+                */
                radeon_set_context_reg(cs, R_028400_VGT_MAX_VTX_INDX, ~0);
                radeon_set_context_reg(cs, R_028404_VGT_MIN_VTX_INDX, 0);
                radeon_set_context_reg(cs, R_028408_VGT_INDX_OFFSET, 0);
@@ -402,11 +435,15 @@ si_emit_config(struct radv_physical_device *physical_device,
 
        if (physical_device->rad_info.chip_class >= CIK) {
                if (physical_device->rad_info.chip_class >= GFX9) {
-                       radeon_set_sh_reg(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_CU_EN(0xffff));
+                       radeon_set_sh_reg(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
+                                         S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F));
                } else {
-                       radeon_set_sh_reg(cs, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xffff));
-                       radeon_set_sh_reg(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, 0);
-                       radeon_set_sh_reg(cs, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, S_00B31C_CU_EN(0xffff));
+                       radeon_set_sh_reg(cs, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
+                                         S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F));
+                       radeon_set_sh_reg(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
+                                         S_00B41C_WAVE_LIMIT(0x3F));
+                       radeon_set_sh_reg(cs, R_00B31C_SPI_SHADER_PGM_RSRC3_ES,
+                                         S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F));
                        /* If this is 0, Bonaire can hang even if GS isn't being used.
                         * Other chips are unaffected. These are suboptimal values,
                         * but we don't use on-chip GS.
@@ -415,7 +452,8 @@ si_emit_config(struct radv_physical_device *physical_device,
                                               S_028A44_ES_VERTS_PER_SUBGRP(64) |
                                               S_028A44_GS_PRIMS_PER_SUBGRP(4));
                }
-               radeon_set_sh_reg(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, S_00B21C_CU_EN(0xffff));
+               radeon_set_sh_reg(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
+                                 S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F));
 
                if (physical_device->rad_info.num_good_compute_units /
                    (physical_device->rad_info.max_se * physical_device->rad_info.max_sh_per_se) <= 4) {
@@ -425,7 +463,8 @@ si_emit_config(struct radv_physical_device *physical_device,
                         *
                         * LATE_ALLOC_VS = 2 is the highest safe number.
                         */
-                       radeon_set_sh_reg(cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, S_00B118_CU_EN(0xffff));
+                       radeon_set_sh_reg(cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
+                                         S_00B118_CU_EN(0xffff) | S_00B118_WAVE_LIMIT(0x3F) );
                        radeon_set_sh_reg(cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(2));
                } else {
                        /* Set LATE_ALLOC_VS == 31. It should be less than
@@ -433,11 +472,13 @@ si_emit_config(struct radv_physical_device *physical_device,
                         * - VS can't execute on CU0.
                         * - If HS writes outputs to LDS, LS can't execute on CU0.
                         */
-                       radeon_set_sh_reg(cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, S_00B118_CU_EN(0xfffe));
+                       radeon_set_sh_reg(cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
+                                         S_00B118_CU_EN(0xfffe) | S_00B118_WAVE_LIMIT(0x3F));
                        radeon_set_sh_reg(cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(31));
                }
 
-               radeon_set_sh_reg(cs, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, S_00B01C_CU_EN(0xffff));
+               radeon_set_sh_reg(cs, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
+                                 S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F));
        }
 
        if (physical_device->rad_info.chip_class >= VI) {
@@ -445,9 +486,6 @@ si_emit_config(struct radv_physical_device *physical_device,
                radeon_set_context_reg(cs, R_028424_CB_DCC_CONTROL,
                                       S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(1) |
                                       S_028424_OVERWRITE_COMBINER_WATERMARK(4));
-               if (physical_device->rad_info.family < CHIP_POLARIS10)
-                       radeon_set_context_reg(cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 30);
-               radeon_set_context_reg(cs, R_028C5C_VGT_OUT_DEALLOC_CNTL, 32);
 
                vgt_tess_distribution = S_028B50_ACCUM_ISOLINE(32) |
                        S_028B50_ACCUM_TRI(11) |
@@ -460,20 +498,18 @@ si_emit_config(struct radv_physical_device *physical_device,
 
                radeon_set_context_reg(cs, R_028B50_VGT_TESS_DISTRIBUTION,
                                       vgt_tess_distribution);
-       } else {
+       } else if (!physical_device->has_clear_state) {
                radeon_set_context_reg(cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
                radeon_set_context_reg(cs, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16);
        }
 
-       if (physical_device->has_rbplus)
-               radeon_set_context_reg(cs, R_028C40_PA_SC_SHADER_CONTROL, 0);
-
        if (physical_device->rad_info.chip_class >= GFX9) {
                unsigned num_se = physical_device->rad_info.max_se;
                unsigned pc_lines = 0;
 
                switch (physical_device->rad_info.family) {
                case CHIP_VEGA10:
+               case CHIP_VEGA12:
                        pc_lines = 4096;
                        break;
                case CHIP_RAVEN:
@@ -483,16 +519,6 @@ si_emit_config(struct radv_physical_device *physical_device,
                        assert(0);
                }
 
-               radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL,
-                                      S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF));
-               radeon_set_context_reg(cs, R_028064_DB_RENDER_FILTER, 0);
-               /* TODO: We can use this to disable RBs for rendering to GART: */
-               radeon_set_context_reg(cs, R_02835C_PA_SC_TILE_STEERING_OVERRIDE, 0);
-               radeon_set_context_reg(cs, R_02883C_PA_SU_OVER_RASTERIZATION_CNTL, 0);
-               /* TODO: Enable the binner: */
-               radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0,
-                                      S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
-                                      S_028C44_DISABLE_START_OF_PRIM(1));
                radeon_set_context_reg(cs, R_028C48_PA_SC_BINNER_CNTL_1,
                                       S_028C48_MAX_ALLOC_COUNT(MIN2(128, pc_lines / (4 * num_se))) |
                                       S_028C48_MAX_PRIM_PER_BATCH(1023));
@@ -508,6 +534,26 @@ si_emit_config(struct radv_physical_device *physical_device,
        radeon_emit(cs, S_028A04_MIN_SIZE(radv_pack_float_12p4(0)) |
                    S_028A04_MAX_SIZE(radv_pack_float_12p4(8192/2)));
 
+       if (!physical_device->has_clear_state) {
+               radeon_set_context_reg(cs, R_028004_DB_COUNT_CONTROL,
+                                      S_028004_ZPASS_INCREMENT_DISABLE(1));
+       }
+
+       /* Enable the Polaris small primitive filter control.
+        * XXX: There is possibly an issue when MSAA is off (see RadeonSI
+        * has_msaa_sample_loc_bug). But this doesn't seem to regress anything,
+        * and AMDVLK doesn't have a workaround as well.
+        */
+       if (physical_device->rad_info.family >= CHIP_POLARIS10) {
+               unsigned small_prim_filter_cntl =
+                       S_028830_SMALL_PRIM_FILTER_ENABLE(1) |
+                       /* Workaround for a hw line bug. */
+                       S_028830_LINE_FILTER_DISABLE(physical_device->rad_info.family <= CHIP_POLARIS12);
+
+               radeon_set_context_reg(cs, R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL,
+                                      small_prim_filter_cntl);
+       }
+
        si_emit_compute(physical_device, cs);
 }
 
@@ -537,7 +583,9 @@ cik_create_gfx_config(struct radv_device *device)
        device->gfx_init = device->ws->buffer_create(device->ws,
                                                     cs->cdw * 4, 4096,
                                                     RADEON_DOMAIN_GTT,
-                                                    RADEON_FLAG_CPU_ACCESS);
+                                                    RADEON_FLAG_CPU_ACCESS|
+                                                    RADEON_FLAG_NO_INTERPROCESS_SHARING |
+                                                    RADEON_FLAG_READ_ONLY);
        if (!device->gfx_init)
                goto fail;
 
@@ -642,7 +690,8 @@ si_write_scissors(struct radeon_winsys_cs *cs, int first,
        int i;
        float scale[3], translate[3], guardband_x = INFINITY, guardband_y = INFINITY;
        const float max_range = 32767.0f;
-       assert(count);
+       if (!count)
+               return;
 
        radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL + first * 4 * 2, count * 2);
        for (i = 0; i < count; i++) {
@@ -708,21 +757,21 @@ si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer,
        bool ia_switch_on_eop = false;
        bool ia_switch_on_eoi = false;
        bool partial_vs_wave = false;
-       bool partial_es_wave = cmd_buffer->state.pipeline->graphics.partial_es_wave;
+       bool partial_es_wave = cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.partial_es_wave;
        bool multi_instances_smaller_than_primgroup;
 
        multi_instances_smaller_than_primgroup = indirect_draw;
        if (!multi_instances_smaller_than_primgroup && instanced_draw) {
                uint32_t num_prims = radv_prims_for_vertices(&cmd_buffer->state.pipeline->graphics.prim_vertex_count, draw_vertex_count);
-               if (num_prims < cmd_buffer->state.pipeline->graphics.primgroup_size)
+               if (num_prims < cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.primgroup_size)
                        multi_instances_smaller_than_primgroup = true;
        }
 
-       ia_switch_on_eoi = cmd_buffer->state.pipeline->graphics.ia_switch_on_eoi;
-       partial_vs_wave = cmd_buffer->state.pipeline->graphics.partial_vs_wave;
+       ia_switch_on_eoi = cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.ia_switch_on_eoi;
+       partial_vs_wave = cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.partial_vs_wave;
 
        if (chip_class >= CIK) {
-               wd_switch_on_eop = cmd_buffer->state.pipeline->graphics.wd_switch_on_eop;
+               wd_switch_on_eop = cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.wd_switch_on_eop;
 
                /* Hawaii hangs if instancing is enabled and WD_SWITCH_ON_EOP is 0.
                 * We don't know that for indirect drawing, so treat it as
@@ -782,7 +831,7 @@ si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer,
                }
        }
 
-       return cmd_buffer->state.pipeline->graphics.base_ia_multi_vgt_param |
+       return cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.base |
                S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) |
                S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) |
                S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) |
@@ -884,7 +933,6 @@ si_emit_acquire_mem(struct radeon_winsys_cs *cs,
 
 void
 si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
-                      bool predicated,
                        enum chip_class chip_class,
                       uint32_t *flush_cnt,
                       uint64_t flush_va,
@@ -915,7 +963,7 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
                        /* Necessary for DCC */
                        if (chip_class >= VI) {
                                si_cs_emit_write_event_eop(cs,
-                                                          predicated,
+                                                          false,
                                                           chip_class,
                                                           is_mec,
                                                           V_028A90_FLUSH_AND_INV_CB_DATA_TS,
@@ -929,33 +977,36 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
        }
 
        if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB_META) {
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, predicated));
+               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
                radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
        }
 
        if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB_META) {
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, predicated));
+               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
                radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
        }
 
-       if (!flush_cb_db) {
-               if (flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) {
-                       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, predicated));
-                       radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
-               } else if (flush_bits & RADV_CMD_FLAG_VS_PARTIAL_FLUSH) {
-                       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, predicated));
-                       radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
-               }
+       if (flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) {
+               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+               radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+       } else if (flush_bits & RADV_CMD_FLAG_VS_PARTIAL_FLUSH) {
+               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+               radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
        }
 
        if (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH) {
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, predicated));
+               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
                radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
        }
 
        if (chip_class >= GFX9 && flush_cb_db) {
                unsigned cb_db_event, tc_flags;
 
+#if 0
+               /* This breaks a bunch of:
+                  dEQP-VK.renderpass.dedicated_allocation.formats.d32_sfloat_s8_uint.input*.
+                  use the big hammer always.
+               */
                /* Set the CB/DB flush event. */
                switch (flush_cb_db) {
                case RADV_CMD_FLAG_FLUSH_AND_INV_CB:
@@ -968,25 +1019,30 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
                        /* both CB & DB */
                        cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
                }
-
-               /* TC    | TC_WB         = invalidate L2 data
-                * TC_MD | TC_WB         = invalidate L2 metadata
-                * TC    | TC_WB | TC_MD = invalidate L2 data & metadata
+#else
+               cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
+#endif
+               /* These are the only allowed combinations. If you need to
+                * do multiple operations at once, do them separately.
+                * All operations that invalidate L2 also seem to invalidate
+                * metadata. Volatile (VOL) and WC flushes are not listed here.
                 *
-                * The metadata cache must always be invalidated for coherency
-                * between CB/DB and shaders. (metadata = HTILE, CMASK, DCC)
-                *
-                * TC must be invalidated on GFX9 only if the CB/DB surface is
-                * not pipe-aligned. If the surface is RB-aligned, it might not
-                * strictly be pipe-aligned since RB alignment takes precendence.
+                * TC    | TC_WB         = writeback & invalidate L2 & L1
+                * TC    | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC
+                *         TC_WB | TC_NC = writeback L2 for MTYPE == NC
+                * TC            | TC_NC = invalidate L2 for MTYPE == NC
+                * TC    | TC_MD         = writeback & invalidate L2 metadata (DCC, etc.)
+                * TCL1                  = invalidate L1
                 */
-               tc_flags = EVENT_TC_WB_ACTION_ENA |
-                          EVENT_TC_MD_ACTION_ENA;
+               tc_flags = EVENT_TC_ACTION_ENA |
+                          EVENT_TC_MD_ACTION_ENA;
 
                /* Ideally flush TC together with CB/DB. */
                if (flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) {
-                       tc_flags |= EVENT_TC_ACTION_ENA |
-                                   EVENT_TCL1_ACTION_ENA;
+                       /* Writeback and invalidate everything in L2 & L1. */
+                       tc_flags = EVENT_TC_ACTION_ENA |
+                                  EVENT_TC_WB_ACTION_ENA;
+
 
                        /* Clear the flags. */
                        flush_bits &= ~(RADV_CMD_FLAG_INV_GLOBAL_L2 |
@@ -996,14 +1052,14 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
                assert(flush_cnt);
                uint32_t old_fence = (*flush_cnt)++;
 
-               si_cs_emit_write_event_eop(cs, predicated, chip_class, false, cb_db_event, tc_flags, 1,
+               si_cs_emit_write_event_eop(cs, false, chip_class, false, cb_db_event, tc_flags, 1,
                                           flush_va, old_fence, *flush_cnt);
-               si_emit_wait_fence(cs, predicated, flush_va, *flush_cnt, 0xffffffff);
+               si_emit_wait_fence(cs, false, flush_va, *flush_cnt, 0xffffffff);
        }
 
        /* VGT state sync */
        if (flush_bits & RADV_CMD_FLAG_VGT_FLUSH) {
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, predicated));
+               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
                radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
        }
 
@@ -1016,13 +1072,13 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
                            RADV_CMD_FLAG_INV_GLOBAL_L2 |
                            RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) &&
            !is_mec) {
-               radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, predicated));
+               radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
                radeon_emit(cs, 0);
        }
 
        if ((flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) ||
            (chip_class <= CIK && (flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) {
-               si_emit_acquire_mem(cs, is_mec, predicated, chip_class >= GFX9,
+               si_emit_acquire_mem(cs, is_mec, false, chip_class >= GFX9,
                                    cp_coher_cntl |
                                    S_0085F0_TC_ACTION_ENA(1) |
                                    S_0085F0_TCL1_ACTION_ENA(1) |
@@ -1036,7 +1092,7 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
                         *
                         * WB doesn't work without NC.
                         */
-                       si_emit_acquire_mem(cs, is_mec, predicated,
+                       si_emit_acquire_mem(cs, is_mec, false,
                                            chip_class >= GFX9,
                                            cp_coher_cntl |
                                            S_0301F0_TC_WB_ACTION_ENA(1) |
@@ -1045,7 +1101,7 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
                }
                if (flush_bits & RADV_CMD_FLAG_INV_VMEM_L1) {
                        si_emit_acquire_mem(cs, is_mec,
-                                           predicated, chip_class >= GFX9,
+                                           false, chip_class >= GFX9,
                                            cp_coher_cntl |
                                            S_0085F0_TCL1_ACTION_ENA(1));
                        cp_coher_cntl = 0;
@@ -1056,7 +1112,7 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
         * Therefore, it should be last. Done in PFP.
         */
        if (cp_coher_cntl)
-               si_emit_acquire_mem(cs, is_mec, predicated, chip_class >= GFX9, cp_coher_cntl);
+               si_emit_acquire_mem(cs, is_mec, false, chip_class >= GFX9, cp_coher_cntl);
 }
 
 void
@@ -1086,14 +1142,15 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
                ptr = &cmd_buffer->gfx9_fence_idx;
        }
        si_cs_emit_cache_flush(cmd_buffer->cs,
-                              cmd_buffer->state.predicating,
                               cmd_buffer->device->physical_device->rad_info.chip_class,
                               ptr, va,
                               radv_cmd_buffer_uses_mec(cmd_buffer),
                               cmd_buffer->state.flush_bits);
 
 
-       radv_cmd_buffer_trace_emit(cmd_buffer);
+       if (unlikely(cmd_buffer->device->trace_bo))
+               radv_cmd_buffer_trace_emit(cmd_buffer);
+
        cmd_buffer->state.flush_bits = 0;
 }
 
@@ -1217,7 +1274,8 @@ static void si_emit_cp_dma(struct radv_cmd_buffer *cmd_buffer,
                radeon_emit(cs, 0);
        }
 
-       radv_cmd_buffer_trace_emit(cmd_buffer);
+       if (unlikely(cmd_buffer->device->trace_bo))
+               radv_cmd_buffer_trace_emit(cmd_buffer);
 }
 
 void si_cp_dma_prefetch(struct radv_cmd_buffer *cmd_buffer, uint64_t va,