uint64_t dst_va, uint64_t src_va,
unsigned size, unsigned flags)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
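/* CP_SYNC makes the CP wait until this DMA transfer is complete; if the
* caller doesn't request a sync, the write confirmation is disabled
* instead. RAW_WAIT makes this transfer wait for the previous CP DMA to
* finish first (read-after-write hazard). */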
uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0;
uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0;
uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0;
uint64_t dst_va, unsigned size,
uint32_t clear_value, unsigned flags)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
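/* Same synchronization flags as in the copy path above. */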
uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0;
uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0;
uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0;
if (is_framebuffer)
return SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
- return SI_CONTEXT_INV_TC_L1 |
- (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
- SI_CONTEXT_INV_KCACHE;
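+ /* The cache flags were renamed: KCACHE is the scalar L1 (SMEM),
+ * TC_L1 the vector L1 (VMEM), and TC_L2 the global L2. */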
+ return SI_CONTEXT_INV_SMEM_L1 |
+ SI_CONTEXT_INV_VMEM_L1 |
+ (sctx->b.chip_class == SI ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
}
static unsigned get_tc_l2_flag(struct si_context *sctx, bool is_framebuffer)
si_need_cs_space(sctx);
/* This must be done after need_cs_space, which may flush the CS and
* thereby empty the buffer list. */
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
(struct r600_resource*)dst,
RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
if (src)
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
(struct r600_resource*)src,
RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
/* Fallback for unaligned clears: CP DMA clears whole dwords, so write
* byte-unaligned ranges through a CPU mapping instead. */
if (offset % 4 != 0 || size % 4 != 0) {
- uint8_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf,
- sctx->b.rings.gfx.cs,
+ uint8_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->buf,
+ sctx->b.gfx.cs,
PIPE_TRANSFER_WRITE);
map += offset;
for (unsigned i = 0; i < size; i++) {
dst_offset += r600_resource(dst)->gpu_address;
src_offset += r600_resource(src)->gpu_address;
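/* dst_offset and src_offset are now absolute GPU virtual addresses. */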
- /* If the size is not aligned, we must add a dummy copy at the end
- * just to align the internal counter. Otherwise, the DMA engine
- * would slow down by an order of magnitude for following copies.
- */
- if (size % CP_DMA_ALIGNMENT)
- realign_size = CP_DMA_ALIGNMENT - (size % CP_DMA_ALIGNMENT);
-
- /* If the copy begins unaligned, we must start copying from the next
- * aligned block and the skipped part should be copied after everything
- * else has been copied. Only the src alignment matters, not dst.
- */
- if (src_offset % CP_DMA_ALIGNMENT) {
- skipped_size = CP_DMA_ALIGNMENT - (src_offset % CP_DMA_ALIGNMENT);
- /* The main part will be skipped if the size is too small. */
- skipped_size = MIN2(skipped_size, size);
- size -= skipped_size;
+ /* The workarounds aren't needed on Fiji and beyond, with the exception
+ * of Stoney. */
+ if (sctx->b.family <= CHIP_CARRIZO ||
+ sctx->b.family == CHIP_STONEY) {
+ /* If the size is not aligned, we must add a dummy copy at the end
+ * just to align the internal counter. Otherwise, the DMA engine
+ * would slow down by an order of magnitude for following copies.
+ */
+ if (size % CP_DMA_ALIGNMENT)
+ realign_size = CP_DMA_ALIGNMENT - (size % CP_DMA_ALIGNMENT);
+
+ /* If the copy begins unaligned, we must start copying from the next
+ * aligned block and the skipped part should be copied after everything
+ * else has been copied. Only the src alignment matters, not dst.
+ */
+ if (src_offset % CP_DMA_ALIGNMENT) {
+ skipped_size = CP_DMA_ALIGNMENT - (src_offset % CP_DMA_ALIGNMENT);
+ /* The main part will be skipped if the size is too small. */
+ skipped_size = MIN2(skipped_size, size);
+ size -= skipped_size;
+ }
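+ /* Illustration: with CP_DMA_ALIGNMENT == 32, a 70-byte copy starting
+ * at src_offset % 32 == 13 gets realign_size = 26 (dummy tail copy),
+ * skipped_size = 19 (head, copied last), and a 51-byte main copy;
+ * 19 + 51 + 26 = 96 bytes keeps the internal counter 32-byte aligned. */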
}
/* Flush the caches. */