radv: Add shader prefetch.
author Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Wed, 19 Apr 2017 20:32:16 +0000 (22:32 +0200)
committer Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Wed, 19 Apr 2017 21:47:27 +0000 (23:47 +0200)
Gives me approximately a 2% perf increase in both dota2 & talos.

Having descriptors (both sets and vertex buffers) prefetched
didn't help so I didn't include that.

Signed-off-by: Bas Nieuwenhuizen <basni@google.com>
Reviewed-by: Dave Airlie <airlied@redhat.com>
src/amd/vulkan/radv_cmd_buffer.c
src/amd/vulkan/radv_private.h
src/amd/vulkan/si_cmd_buffer.c

index 8877f20b9ea7c3bf4c710c1648246a21095db2ee..958ae6e361e3ba86410b8e6af70008e4fd8cc5c4 100644 (file)
@@ -521,6 +521,7 @@ radv_emit_hw_vs(struct radv_cmd_buffer *cmd_buffer,
        unsigned export_count;
 
        ws->cs_add_buffer(cmd_buffer->cs, shader->bo, 8);
+       si_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
 
        export_count = MAX2(1, outinfo->param_exports);
        radeon_set_context_reg(cmd_buffer->cs, R_0286C4_SPI_VS_OUT_CONFIG,
@@ -568,6 +569,7 @@ radv_emit_hw_es(struct radv_cmd_buffer *cmd_buffer,
        uint64_t va = ws->buffer_get_va(shader->bo);
 
        ws->cs_add_buffer(cmd_buffer->cs, shader->bo, 8);
+       si_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
 
        radeon_set_context_reg(cmd_buffer->cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
                               outinfo->esgs_itemsize / 4);
@@ -587,6 +589,7 @@ radv_emit_hw_ls(struct radv_cmd_buffer *cmd_buffer,
        uint32_t rsrc2 = shader->rsrc2;
 
        ws->cs_add_buffer(cmd_buffer->cs, shader->bo, 8);
+       si_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
 
        radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B520_SPI_SHADER_PGM_LO_LS, 2);
        radeon_emit(cmd_buffer->cs, va >> 8);
@@ -610,6 +613,7 @@ radv_emit_hw_hs(struct radv_cmd_buffer *cmd_buffer,
        uint64_t va = ws->buffer_get_va(shader->bo);
 
        ws->cs_add_buffer(cmd_buffer->cs, shader->bo, 8);
+       si_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
 
        radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B420_SPI_SHADER_PGM_LO_HS, 4);
        radeon_emit(cmd_buffer->cs, va >> 8);
@@ -743,6 +747,7 @@ radv_emit_geometry_shader(struct radv_cmd_buffer *cmd_buffer,
 
        va = ws->buffer_get_va(gs->bo);
        ws->cs_add_buffer(cmd_buffer->cs, gs->bo, 8);
+       si_cp_dma_prefetch(cmd_buffer, va, gs->code_size);
        radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B220_SPI_SHADER_PGM_LO_GS, 4);
        radeon_emit(cmd_buffer->cs, va >> 8);
        radeon_emit(cmd_buffer->cs, va >> 40);
@@ -783,6 +788,7 @@ radv_emit_fragment_shader(struct radv_cmd_buffer *cmd_buffer,
 
        va = ws->buffer_get_va(ps->bo);
        ws->cs_add_buffer(cmd_buffer->cs, ps->bo, 8);
+       si_cp_dma_prefetch(cmd_buffer, va, ps->code_size);
 
        radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B020_SPI_SHADER_PGM_LO_PS, 4);
        radeon_emit(cmd_buffer->cs, va >> 8);
index 78a0d6fee7d62b89dd5c374b9368c3ae385ce17a..045bb647d695ac7c2b0c590c25a65d0ebdd7671b 100644 (file)
@@ -835,6 +835,8 @@ void si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer);
 void si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer,
                           uint64_t src_va, uint64_t dest_va,
                           uint64_t size);
+void si_cp_dma_prefetch(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
+                        unsigned size);
 void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
                            uint64_t size, unsigned value);
 void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer);
index 711dbde0aca66f461016bf5b372bb3027791ae9b..41625aa132dc5b4b8989d45117b757327883db44 100644 (file)
@@ -1022,6 +1022,16 @@ static void si_emit_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer,
        radv_cmd_buffer_trace_emit(cmd_buffer);
 }
 
+void si_cp_dma_prefetch(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
+                        unsigned size) /* Emits one CP DMA copy covering [va, va + size), widened outward to CP_DMA_ALIGNMENT boundaries. */
+{
+       uint64_t aligned_va = va & ~(CP_DMA_ALIGNMENT - 1); /* Start address rounded down; the mask form assumes CP_DMA_ALIGNMENT is a power of two -- TODO confirm. */
+       uint64_t aligned_size = ((va + size + CP_DMA_ALIGNMENT -1) & ~(CP_DMA_ALIGNMENT - 1)) - aligned_va; /* End (va + size) rounded up, minus the rounded-down start. */
+
+       si_emit_cp_dma_copy_buffer(cmd_buffer, aligned_va, aligned_va, /* The same address is passed for both address arguments, so no data is relocated. */
+                                  aligned_size, CIK_CP_DMA_USE_L2); /* NOTE(review): flag name suggests a CIK-era L2 path; confirm behavior on older chips in si_emit_cp_dma_copy_buffer. */
+}
+
 static void si_cp_dma_prepare(struct radv_cmd_buffer *cmd_buffer, uint64_t byte_count,
                              uint64_t remaining_size, unsigned *flags)
 {