radv: emit {CB,DB}_RMI_L2_CACHE_CONTROL at framebuffer time
[mesa.git] / src / amd / vulkan / radv_cmd_buffer.c
index 92556e3b5e4043301ffa783c49cf55d04257882f..f6c053cb0ad9e302722e7ba2bd561241dfb0eec9 100644 (file)
@@ -483,7 +483,8 @@ radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer,
                                       RADEON_DOMAIN_GTT,
                                       RADEON_FLAG_CPU_ACCESS|
                                       RADEON_FLAG_NO_INTERPROCESS_SHARING |
-                                      RADEON_FLAG_32BIT,
+                                      RADEON_FLAG_32BIT |
+                                      RADEON_FLAG_GTT_WC,
                                       RADV_BO_PRIORITY_UPLOAD_BUFFER);
 
        if (!bo) {
@@ -1735,6 +1736,17 @@ radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer,
        radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, ds->db_htile_surface);
 
        if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
+               /* Enable HTILE caching in L2 for small chips. */
+               unsigned meta_write_policy, meta_read_policy;
+               /* TODO: investigate whether LRU improves performance on other chips too */
+               if (cmd_buffer->device->physical_device->rad_info.num_render_backends <= 4) {
+                       meta_write_policy = V_02807C_CACHE_LRU_WR; /* cache writes */
+                       meta_read_policy =  V_02807C_CACHE_LRU_RD; /* cache reads */
+               } else {
+                       meta_write_policy = V_02807C_CACHE_STREAM_WR; /* write combine */
+                       meta_read_policy =  V_02807C_CACHE_NOA_RD;    /* don't cache reads */
+               }
+
                radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
                radeon_set_context_reg(cmd_buffer->cs, R_02801C_DB_DEPTH_SIZE_XY, ds->db_depth_size);
 
@@ -1747,12 +1759,20 @@ radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer,
                radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
                radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
 
-               radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_READ_BASE_HI, 5);
+               radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_READ_BASE_HI, 6);
                radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
                radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
                radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
                radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
                radeon_emit(cmd_buffer->cs, ds->db_htile_data_base >> 32);
+               radeon_emit(cmd_buffer->cs,
+                           S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
+                           S_02807C_S_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
+                           S_02807C_HTILE_WR_POLICY(meta_write_policy) |
+                           S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
+                           S_02807C_Z_RD_POLICY(V_02807C_CACHE_NOA_RD) |
+                           S_02807C_S_RD_POLICY(V_02807C_CACHE_NOA_RD) |
+                           S_02807C_HTILE_RD_POLICY(meta_read_policy));
        } else if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
                radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3);
                radeon_emit(cmd_buffer->cs, ds->db_htile_data_base);
@@ -2293,6 +2313,29 @@ radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer)
                                       S_028424_DISABLE_CONSTANT_ENCODE_REG(disable_constant_encode));
        }
 
+       if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
+               /* Enable CMASK/FMASK/DCC caching in L2 for small chips. */
+               unsigned meta_write_policy, meta_read_policy;
+               /* TODO: investigate whether LRU improves performance on other chips too */
+               if (cmd_buffer->device->physical_device->rad_info.num_render_backends <= 4) {
+                       meta_write_policy = V_02807C_CACHE_LRU_WR; /* cache writes */
+                       meta_read_policy =  V_02807C_CACHE_LRU_RD; /* cache reads */
+               } else {
+                       meta_write_policy = V_02807C_CACHE_STREAM_WR; /* write combine */
+                       meta_read_policy =  V_02807C_CACHE_NOA_RD;    /* don't cache reads */
+               }
+
+               radeon_set_context_reg(cmd_buffer->cs, R_028410_CB_RMI_GL2_CACHE_CONTROL,
+                                      S_028410_CMASK_WR_POLICY(meta_write_policy) |
+                                      S_028410_FMASK_WR_POLICY(meta_write_policy) |
+                                      S_028410_DCC_WR_POLICY(meta_write_policy)  |
+                                      S_028410_COLOR_WR_POLICY(V_028410_CACHE_STREAM_WR) |
+                                      S_028410_CMASK_RD_POLICY(meta_read_policy) |
+                                      S_028410_FMASK_RD_POLICY(meta_read_policy) |
+                                      S_028410_DCC_RD_POLICY(meta_read_policy) |
+                                      S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA_RD));
+       }
+
        if (cmd_buffer->device->dfsm_allowed) {
                radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
                radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
@@ -2716,13 +2759,10 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
                        if (cmd_buffer->device->physical_device->rad_info.chip_class != GFX8 && stride)
                                num_records /= stride;
 
-                       desc[0] = va;
-                       desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
-                       desc[2] = num_records;
-                       desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
-                                 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-                                 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
-                                 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
+                       uint32_t rsrc_word3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+                                             S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+                                             S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+                                             S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
 
                        if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
                                /* OOB_SELECT chooses the out-of-bounds check:
@@ -2731,13 +2771,18 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
                                 */
                                int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW;
 
-                               desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_UINT) |
-                                          S_008F0C_OOB_SELECT(oob_select) |
-                                          S_008F0C_RESOURCE_LEVEL(1);
+                               rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_UINT) |
+                                            S_008F0C_OOB_SELECT(oob_select) |
+                                            S_008F0C_RESOURCE_LEVEL(1);
                        } else {
-                               desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
-                                          S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+                               rsrc_word3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
+                                            S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
                        }
+
+                       desc[0] = va;
+                       desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
+                       desc[2] = num_records;
+                       desc[3] = rsrc_word3;
                }
 
                va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
@@ -2828,21 +2873,23 @@ radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
                        if (cmd_buffer->device->physical_device->use_ngg_streamout)
                                size = buffer->size - sb[i].offset;
 
-                       desc[0] = va;
-                       desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
-                       desc[2] = size;
-                       desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
-                                 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-                                 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
-                                 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
+                       uint32_t rsrc_word3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+                                             S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+                                             S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+                                             S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
 
                        if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
-                               desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
-                                          S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
-                                          S_008F0C_RESOURCE_LEVEL(1);
+                               rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
+                                             S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
+                                             S_008F0C_RESOURCE_LEVEL(1);
                        } else {
-                               desc[3] |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+                               rsrc_word3 |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
                        }
+
+                       desc[0] = va;
+                       desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
+                       desc[2] = size;
+                       desc[3] = rsrc_word3;
                }
 
                va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
@@ -3543,6 +3590,7 @@ radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer,
                }
 
                state->attachments[i].current_layout = att->initial_layout;
+               state->attachments[i].current_in_render_loop = false;
                state->attachments[i].current_stencil_layout = att->stencil_initial_layout;
                state->attachments[i].sample_location.count = 0;