radeonsi/gfx9: use ACQUIRE_MEM
[mesa.git] / src / gallium / drivers / radeonsi / si_state_draw.c
index 141dd8f7526aff1e7b7c23be4327346b1e745ea2..a80f021d64b0e17fce37483ef8bed77b38910450 100644 (file)
@@ -27,6 +27,7 @@
 #include "si_pipe.h"
 #include "radeon/r600_cs.h"
 #include "sid.h"
+#include "gfx9d.h"
 
 #include "util/u_index_modify.h"
 #include "util/u_upload_mgr.h"
@@ -379,7 +380,9 @@ si_get_init_multi_vgt_param(struct si_screen *sscreen,
                S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) |
                S_028AA8_WD_SWITCH_ON_EOP(sscreen->b.chip_class >= CIK ? wd_switch_on_eop : 0) |
                S_028AA8_MAX_PRIMGRP_IN_WAVE(sscreen->b.chip_class >= VI ?
-                                            max_primgroup_in_wave : 0);
+                                            max_primgroup_in_wave : 0) |
+               S_030960_EN_INST_OPT_BASIC(sscreen->b.chip_class >= GFX9) |
+               S_030960_EN_INST_OPT_ADV(sscreen->b.chip_class >= GFX9);
 }
 
 void si_init_ia_multi_vgt_param_table(struct si_context *sctx)
@@ -506,7 +509,9 @@ static void si_emit_draw_registers(struct si_context *sctx,
 
        /* Draw state. */
        if (ia_multi_vgt_param != sctx->last_multi_vgt_param) {
-               if (sctx->b.chip_class >= CIK)
+               if (sctx->b.chip_class >= GFX9)
+                       radeon_set_uconfig_reg_idx(cs, R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param);
+               else if (sctx->b.chip_class >= CIK)
                        radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
                else
                        radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
@@ -529,7 +534,13 @@ static void si_emit_draw_registers(struct si_context *sctx,
 
        /* Primitive restart. */
        if (info->primitive_restart != sctx->last_primitive_restart_en) {
-               radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, info->primitive_restart);
+               if (sctx->b.chip_class >= GFX9)
+                       radeon_set_uconfig_reg(cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN,
+                                              info->primitive_restart);
+               else
+                       radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
+                                              info->primitive_restart);
+
                sctx->last_primitive_restart_en = info->primitive_restart;
 
        }
@@ -578,28 +589,36 @@ static void si_emit_draw_packets(struct si_context *sctx,
        /* draw packet */
        if (info->indexed) {
                if (ib->index_size != sctx->last_index_size) {
-                       radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
+                       unsigned index_type;
 
                        /* index type */
                        switch (ib->index_size) {
                        case 1:
-                               radeon_emit(cs, V_028A7C_VGT_INDEX_8);
+                               index_type = V_028A7C_VGT_INDEX_8;
                                break;
                        case 2:
-                               radeon_emit(cs, V_028A7C_VGT_INDEX_16 |
-                                           (SI_BIG_ENDIAN && sctx->b.chip_class <= CIK ?
-                                                    V_028A7C_VGT_DMA_SWAP_16_BIT : 0));
+                               index_type = V_028A7C_VGT_INDEX_16 |
+                                            (SI_BIG_ENDIAN && sctx->b.chip_class <= CIK ?
+                                                     V_028A7C_VGT_DMA_SWAP_16_BIT : 0);
                                break;
                        case 4:
-                               radeon_emit(cs, V_028A7C_VGT_INDEX_32 |
-                                           (SI_BIG_ENDIAN && sctx->b.chip_class <= CIK ?
-                                                    V_028A7C_VGT_DMA_SWAP_32_BIT : 0));
+                               index_type = V_028A7C_VGT_INDEX_32 |
+                                            (SI_BIG_ENDIAN && sctx->b.chip_class <= CIK ?
+                                                     V_028A7C_VGT_DMA_SWAP_32_BIT : 0);
                                break;
                        default:
                                assert(!"unreachable");
                                return;
                        }
 
+                       if (sctx->b.chip_class >= GFX9) {
+                               radeon_set_uconfig_reg_idx(cs, R_03090C_VGT_INDEX_TYPE,
+                                                          2, index_type);
+                       } else {
+                               radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
+                               radeon_emit(cs, index_type);
+                       }
+
                        sctx->last_index_size = ib->index_size;
                }
 
@@ -718,7 +737,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
                        radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));
                        radeon_emit(cs, index_max_size);
                        radeon_emit(cs, index_va);
-                       radeon_emit(cs, (index_va >> 32UL) & 0xFF);
+                       radeon_emit(cs, index_va >> 32);
                        radeon_emit(cs, info->count);
                        radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA);
                } else {
@@ -735,12 +754,23 @@ static void si_emit_surface_sync(struct r600_common_context *rctx,
 {
        struct radeon_winsys_cs *cs = rctx->gfx.cs;
 
-       /* ACQUIRE_MEM is only required on a compute ring. */
-       radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
-       radeon_emit(cs, cp_coher_cntl);   /* CP_COHER_CNTL */
-       radeon_emit(cs, 0xffffffff);      /* CP_COHER_SIZE */
-       radeon_emit(cs, 0);               /* CP_COHER_BASE */
-       radeon_emit(cs, 0x0000000A);      /* POLL_INTERVAL */
+       if (rctx->chip_class >= GFX9) {
+               /* Flush caches and wait for the caches to assert idle. */
+               radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0));
+               radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
+               radeon_emit(cs, 0xffffffff);    /* CP_COHER_SIZE */
+               radeon_emit(cs, 0xffffff);      /* CP_COHER_SIZE_HI */
+               radeon_emit(cs, 0);             /* CP_COHER_BASE */
+               radeon_emit(cs, 0);             /* CP_COHER_BASE_HI */
+               radeon_emit(cs, 0x0000000A);    /* POLL_INTERVAL */
+       } else {
+               /* ACQUIRE_MEM is only required on a compute ring. */
+               radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
+               radeon_emit(cs, cp_coher_cntl);   /* CP_COHER_CNTL */
+               radeon_emit(cs, 0xffffffff);      /* CP_COHER_SIZE */
+               radeon_emit(cs, 0);               /* CP_COHER_BASE */
+               radeon_emit(cs, 0x0000000A);      /* POLL_INTERVAL */
+       }
 }
 
 void si_emit_cache_flush(struct si_context *sctx)
@@ -914,13 +944,59 @@ static void si_get_draw_start_count(struct si_context *sctx,
                                    unsigned *start, unsigned *count)
 {
        if (info->indirect) {
-               struct r600_resource *indirect =
-                       (struct r600_resource*)info->indirect;
-               int *data = r600_buffer_map_sync_with_rings(&sctx->b,
-                                       indirect, PIPE_TRANSFER_READ);
-                data += info->indirect_offset/sizeof(int);
-               *start = data[2];
-               *count = data[0];
+               unsigned indirect_count;
+               struct pipe_transfer *transfer;
+               unsigned begin, end;
+               unsigned map_size;
+               unsigned *data;
+
+               if (info->indirect_params) {
+                       data = pipe_buffer_map_range(&sctx->b.b,
+                                       info->indirect_params,
+                                       info->indirect_params_offset,
+                                       sizeof(unsigned),
+                                       PIPE_TRANSFER_READ, &transfer);
+
+                       indirect_count = *data;
+
+                       pipe_buffer_unmap(&sctx->b.b, transfer);
+               } else {
+                       indirect_count = info->indirect_count;
+               }
+
+               if (!indirect_count) {
+                       *start = *count = 0;
+                       return;
+               }
+
+               map_size = (indirect_count - 1) * info->indirect_stride + 3 * sizeof(unsigned);
+               data = pipe_buffer_map_range(&sctx->b.b, info->indirect,
+                                            info->indirect_offset, map_size,
+                                            PIPE_TRANSFER_READ, &transfer);
+
+               begin = UINT_MAX;
+               end = 0;
+
+               for (unsigned i = 0; i < indirect_count; ++i) {
+                       unsigned count = data[0];
+                       unsigned start = data[2];
+
+                       if (count > 0) {
+                               begin = MIN2(begin, start);
+                               end = MAX2(end, start + count);
+                       }
+
+                       data += info->indirect_stride / sizeof(unsigned);
+               }
+
+               pipe_buffer_unmap(&sctx->b.b, transfer);
+
+               if (begin < end) {
+                       *start = begin;
+                       *count = end - begin;
+               } else {
+                       *start = *count = 0;
+               }
        } else {
                *start = info->start;
                *count = info->count;
@@ -1123,6 +1199,12 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
        if (!si_upload_vertex_buffer_descriptors(sctx))
                return;
 
+       /* GFX9 scissor bug workaround. There is also a more efficient but
+        * more involved alternative workaround. */
+       if (sctx->b.chip_class == GFX9 &&
+           si_is_atom_dirty(sctx, &sctx->b.scissors.atom))
+               sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+
        /* Flush caches before the first state atom, which does L2 prefetches. */
        if (sctx->b.flags)
                si_emit_cache_flush(sctx);