radeonsi: set exact shader buffer read/write usage in CS
[mesa.git] / src / gallium / drivers / radeonsi / si_state.c
index 8b2e6e57f4536881b1d24635bd01b8184ebd8b63..757c17f7df8ccf1e961627052a973e7155d244dc 100644 (file)
@@ -113,13 +113,15 @@ static void si_emit_cb_render_state(struct si_context *sctx)
                                  blend &&
                                  blend->blend_enable_4bit & cb_target_mask &&
                                  sctx->framebuffer.nr_samples >= 2;
+               unsigned watermark = sctx->framebuffer.dcc_overwrite_combiner_watermark;
 
                radeon_opt_set_context_reg(
                                sctx, R_028424_CB_DCC_CONTROL,
                                SI_TRACKED_CB_DCC_CONTROL,
                                S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(1) |
-                               S_028424_OVERWRITE_COMBINER_WATERMARK(4) |
-                               S_028424_OVERWRITE_COMBINER_DISABLE(oc_disable));
+                               S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
+                               S_028424_OVERWRITE_COMBINER_DISABLE(oc_disable) |
+                               S_028424_DISABLE_CONSTANT_ENCODE_REG(sctx->screen->has_dcc_constant_encode));
        }
 
        /* RB+ register settings. */
@@ -472,10 +474,11 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
 
        si_pm4_set_reg(pm4, R_028B70_DB_ALPHA_TO_MASK,
                       S_028B70_ALPHA_TO_MASK_ENABLE(state->alpha_to_coverage) |
-                      S_028B70_ALPHA_TO_MASK_OFFSET0(2) |
-                      S_028B70_ALPHA_TO_MASK_OFFSET1(2) |
-                      S_028B70_ALPHA_TO_MASK_OFFSET2(2) |
-                      S_028B70_ALPHA_TO_MASK_OFFSET3(2));
+                      S_028B70_ALPHA_TO_MASK_OFFSET0(3) |
+                      S_028B70_ALPHA_TO_MASK_OFFSET1(1) |
+                      S_028B70_ALPHA_TO_MASK_OFFSET2(0) |
+                      S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
+                      S_028B70_OFFSET_ROUND(1));
 
        if (state->alpha_to_coverage)
                blend->need_src_alpha_4bit |= 0xf;
@@ -898,7 +901,7 @@ static void *si_create_rs_state(struct pipe_context *ctx,
 
        if (state->point_size_per_vertex) {
                psize_min = util_get_min_point_size(state);
-               psize_max = 8192;
+               psize_max = SI_MAX_POINT_SIZE;
        } else {
                /* Force the point size to be as if the vertex output was disabled. */
                psize_min = state->point_size;
@@ -1353,6 +1356,14 @@ void si_save_qbo_state(struct si_context *sctx, struct si_qbo_state *st)
 
        si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
        si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo);
+
+       st->saved_ssbo_writable_mask = 0;
+
+       for (unsigned i = 0; i < 3; i++) {
+               if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
+                   (1u << si_get_shaderbuf_slot(i)))
+                       st->saved_ssbo_writable_mask |= 1 << i;
+       }
 }
 
 static void si_emit_db_render_state(struct si_context *sctx)
@@ -2149,7 +2160,7 @@ static boolean si_is_format_supported(struct pipe_screen *screen,
        unsigned retval = 0;
 
        if (target >= PIPE_MAX_TEXTURE_TYPES) {
-               PRINT_ERR("r600: unsupported texture type %d\n", target);
+               PRINT_ERR("radeonsi: unsupported texture type %d\n", target);
                return false;
        }
 
@@ -2804,9 +2815,11 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
         *
         * Only flush and wait for CB if there is actually a bound color buffer.
         */
-       if (sctx->framebuffer.uncompressed_cb_mask)
+       if (sctx->framebuffer.uncompressed_cb_mask) {
                si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
-                                          sctx->framebuffer.CB_has_shader_readable_metadata);
+                                          sctx->framebuffer.CB_has_shader_readable_metadata,
+                                          sctx->framebuffer.all_DCC_pipe_aligned);
+       }
 
        sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
 
@@ -2855,6 +2868,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
        sctx->framebuffer.any_dst_linear = false;
        sctx->framebuffer.CB_has_shader_readable_metadata = false;
        sctx->framebuffer.DB_has_shader_readable_metadata = false;
+       sctx->framebuffer.all_DCC_pipe_aligned = true;
+       unsigned num_bpp64_colorbufs = 0;
 
        for (i = 0; i < state->nr_cbufs; i++) {
                if (!state->cbufs[i])
@@ -2901,10 +2916,17 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 
                if (tex->surface.is_linear)
                        sctx->framebuffer.any_dst_linear = true;
+               if (tex->surface.bpe >= 8)
+                       num_bpp64_colorbufs++;
 
-               if (vi_dcc_enabled(tex, surf->base.u.tex.level))
+               if (vi_dcc_enabled(tex, surf->base.u.tex.level)) {
                        sctx->framebuffer.CB_has_shader_readable_metadata = true;
 
+                       if (sctx->chip_class >= GFX9 &&
+                           !tex->surface.u.gfx9.dcc.pipe_aligned)
+                               sctx->framebuffer.all_DCC_pipe_aligned = false;
+               }
+
                si_context_add_resource_size(sctx, surf->base.texture);
 
                p_atomic_inc(&tex->framebuffers_bound);
@@ -2916,6 +2938,14 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
                }
        }
 
+       /* For optimal DCC performance. */
+       if (sctx->chip_class == VI)
+               sctx->framebuffer.dcc_overwrite_combiner_watermark = 4;
+       else if (num_bpp64_colorbufs >= 5)
+               sctx->framebuffer.dcc_overwrite_combiner_watermark = 8;
+       else
+               sctx->framebuffer.dcc_overwrite_combiner_watermark = 6;
+
        struct si_texture *zstex = NULL;
 
        if (state->zsbuf) {
@@ -3557,7 +3587,7 @@ static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples)
  * @param state 256-bit descriptor; only the high 128 bits are filled in
  */
 void
-si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf,
+si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf,
                          enum pipe_format format,
                          unsigned offset, unsigned size,
                          uint32_t *state)
@@ -3600,14 +3630,11 @@ si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf,
         * - For VMEM and inst.IDXEN == 0 or STRIDE == 0, it's in byte units.
         * - For VMEM and inst.IDXEN == 1 and STRIDE != 0, it's in units of STRIDE.
         */
-       if (screen->info.chip_class >= GFX9)
-               /* When vindex == 0, LLVM sets IDXEN = 0, thus changing units
+       if (screen->info.chip_class >= GFX9 && HAVE_LLVM < 0x0800)
+               /* When vindex == 0, LLVM < 8.0 sets IDXEN = 0, thus changing units
                 * from STRIDE to bytes. This works around it by setting
                 * NUM_RECORDS to at least the size of one element, so that
                 * the first element is readable when IDXEN == 0.
-                *
-                * TODO: Fix this in LLVM, but do we need a new intrinsic where
-                *       IDXEN is enforced?
                 */
                num_records = num_records ? MAX2(num_records, stride) : 0;
        else if (screen->info.chip_class == VI)
@@ -4051,7 +4078,7 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
        /* Buffer resource. */
        if (texture->target == PIPE_BUFFER) {
                si_make_buffer_descriptor(sctx->screen,
-                                         r600_resource(texture),
+                                         si_resource(texture),
                                          state->format,
                                          state->u.buf.offset,
                                          state->u.buf.size,
@@ -4571,7 +4598,7 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
                unsigned num_divisors = util_last_bit(v->instance_divisor_is_fetched);
 
                v->instance_divisor_factor_buffer =
-                       (struct r600_resource*)
+                       (struct si_resource*)
                        pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT,
                                           num_divisors * sizeof(divisor_factors[0]));
                if (!v->instance_divisor_factor_buffer) {
@@ -4620,7 +4647,7 @@ static void si_delete_vertex_element(struct pipe_context *ctx, void *state)
 
        if (sctx->vertex_elements == state)
                sctx->vertex_elements = NULL;
-       r600_resource_reference(&v->instance_divisor_factor_buffer, NULL);
+       si_resource_reference(&v->instance_divisor_factor_buffer, NULL);
        FREE(state);
 }
 
@@ -4645,7 +4672,7 @@ static void si_set_vertex_buffers(struct pipe_context *ctx,
                        dsti->stride = src->stride;
                        si_context_add_resource_size(sctx, buf);
                        if (buf)
-                               r600_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER;
+                               si_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER;
                }
        } else {
                for (i = 0; i < count; i++) {
@@ -4674,7 +4701,7 @@ static void si_set_tess_state(struct pipe_context *ctx,
        cb.user_buffer = NULL;
        cb.buffer_size = sizeof(array);
 
-       si_upload_const_buffer(sctx, (struct r600_resource**)&cb.buffer,
+       si_upload_const_buffer(sctx, (struct si_resource**)&cb.buffer,
                               (void*)array, sizeof(array),
                               &cb.buffer_offset);
 
@@ -4689,16 +4716,21 @@ static void si_texture_barrier(struct pipe_context *ctx, unsigned flags)
        si_update_fb_dirtiness_after_rendering(sctx);
 
        /* Multisample surfaces are flushed in si_decompress_textures. */
-       if (sctx->framebuffer.uncompressed_cb_mask)
+       if (sctx->framebuffer.uncompressed_cb_mask) {
                si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
-                                          sctx->framebuffer.CB_has_shader_readable_metadata);
+                                          sctx->framebuffer.CB_has_shader_readable_metadata,
+                                          sctx->framebuffer.all_DCC_pipe_aligned);
+       }
 }
 
 /* This only ensures coherency for shader image/buffer stores. */
-static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
+void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
 {
        struct si_context *sctx = (struct si_context *)ctx;
 
+       if (!(flags & ~PIPE_BARRIER_UPDATE))
+               return;
+
        /* Subsequent commands must wait for all shader invocations to
         * complete. */
        sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
@@ -4809,14 +4841,11 @@ void si_init_state_functions(struct si_context *sctx)
        sctx->b.set_vertex_buffers = si_set_vertex_buffers;
 
        sctx->b.texture_barrier = si_texture_barrier;
-       sctx->b.memory_barrier = si_memory_barrier;
        sctx->b.set_min_samples = si_set_min_samples;
        sctx->b.set_tess_state = si_set_tess_state;
 
        sctx->b.set_active_query_state = si_set_active_query_state;
 
-       sctx->b.draw_vbo = si_draw_vbo;
-
        si_init_config(sctx);
 }
 
@@ -4899,8 +4928,9 @@ static void si_init_config(struct si_context *sctx)
        bool has_clear_state = sscreen->has_clear_state;
        struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
 
-       /* Only SI can disable CLEAR_STATE for now. */
-       assert(has_clear_state || sscreen->info.chip_class == SI);
+       /* SI, radeon kernel disabled CLEAR_STATE. */
+       assert(has_clear_state || sscreen->info.chip_class == SI ||
+              sscreen->info.drm_major != 3);
 
        if (!pm4)
                return;
@@ -5087,6 +5117,7 @@ static void si_init_config(struct si_context *sctx)
                        pc_lines = 4096;
                        break;
                case CHIP_RAVEN:
+               case CHIP_RAVEN2:
                        pc_lines = 1024;
                        break;
                default: