uint64_t dst_va, uint64_t src_va,
unsigned size, unsigned flags)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0;
uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0;
uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0;
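+ /* Write confirmation is only useful together with CP_SYNC; disable it
+ * for unsynchronized copies.
+ */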
radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */
radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */
radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */
- radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */
+ radeon_emit(cs, size | wr_confirm | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */
} else {
radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */
uint64_t dst_va, unsigned size,
uint32_t clear_value, unsigned flags)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0;
uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0;
uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0;
radeon_emit(cs, 0); /* SRC_ADDR_HI: unused when the data is embedded */
radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */
radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */
- radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */
+ radeon_emit(cs, size | wr_confirm | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */
} else {
radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
radeon_emit(cs, clear_value); /* DATA [31:0] */
if (is_framebuffer)
return SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
- return SI_CONTEXT_INV_TC_L1 |
- (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
- SI_CONTEXT_INV_KCACHE;
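+ /* Same caches, new names: SMEM_L1 is the scalar/constant cache
+ * (formerly KCACHE), VMEM_L1 the vector L1 cache (formerly TC_L1),
+ * and GLOBAL_L2 the L2 texture cache (formerly TC_L2).
+ */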
+ return SI_CONTEXT_INV_SMEM_L1 |
+ SI_CONTEXT_INV_VMEM_L1 |
+ (sctx->b.chip_class == SI ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
}
static unsigned get_tc_l2_flag(struct si_context *sctx, bool is_framebuffer)
si_need_cs_space(sctx);
/* This must be done after need_cs_space. */
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
(struct r600_resource*)dst,
RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
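+ /* src may be NULL, e.g. for clears, where the data is embedded in the
+ * packet and there is no source buffer to add.
+ */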
if (src)
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
(struct r600_resource*)src,
RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
*flags |= R600_CP_DMA_SYNC;
}
+/* Alignment for optimal performance. */
+#define CP_DMA_ALIGNMENT 32
/* The max number of bytes to copy per packet. */
-#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
+#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - CP_DMA_ALIGNMENT)
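+/* BYTE_COUNT is a 21-bit field, so the limit is 0x1fffff rounded down to a
+ * multiple of CP_DMA_ALIGNMENT (2097120 bytes); every full-size chunk of a
+ * split copy then leaves the engine aligned. */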
static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
unsigned offset, unsigned size, unsigned value,
/* Fallback for unaligned clears. */
if (offset % 4 != 0 || size % 4 != 0) {
- uint32_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf,
- sctx->b.rings.gfx.cs,
- PIPE_TRANSFER_WRITE);
- size /= 4;
- for (unsigned i = 0; i < size; i++)
- *map++ = value;
+ uint8_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->buf,
+ sctx->b.gfx.cs,
+ PIPE_TRANSFER_WRITE);
+ map += offset;
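+ /* e.g. (little-endian) value = 0xAABBCCDD, offset = 2: the bytes
+ * written are 0xBB, 0xAA, 0xDD, 0xCC, ..., matching the layout that
+ * a dword-aligned clear of the same value would produce.
+ */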
+ for (unsigned i = 0; i < size; i++) {
+ unsigned byte_within_dword = (offset + i) % 4;
+ *map++ = (value >> (byte_within_dword * 8)) & 0xff;
+ }
return;
}
r600_resource(dst)->TC_L2_dirty = true;
}
+/**
+ * Realign the CP DMA engine. This must be done after a copy with an unaligned
+ * size.
+ *
+ * \param size  Number of bytes needed to reach the next CP_DMA_ALIGNMENT
+ *              boundary; always less than CP_DMA_ALIGNMENT.
+ */
+static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size)
+{
+ uint64_t va;
+ unsigned dma_flags = 0;
+ unsigned scratch_size = CP_DMA_ALIGNMENT * 2;
+
+ assert(size < CP_DMA_ALIGNMENT);
+
+ /* Use the scratch buffer as the dummy buffer. The 3D engine should be
+ * idle at this point.
+ */
+ if (!sctx->scratch_buffer ||
+ sctx->scratch_buffer->b.b.width0 < scratch_size) {
+ r600_resource_reference(&sctx->scratch_buffer, NULL);
+ sctx->scratch_buffer =
+ si_resource_create_custom(&sctx->screen->b.b,
+ PIPE_USAGE_DEFAULT,
+ scratch_size);
+ if (!sctx->scratch_buffer)
+ return;
+ sctx->emit_scratch_reloc = true;
+ }
+
+ si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b,
+ &sctx->scratch_buffer->b.b, size, size, &dma_flags);
+
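+ /* The dummy copy goes from va + CP_DMA_ALIGNMENT to va within the
+ * scratch buffer; since size < CP_DMA_ALIGNMENT, scratch_size =
+ * 2 * CP_DMA_ALIGNMENT is exactly enough. The data is never used;
+ * the copy only advances the engine's internal counter back to an
+ * aligned value.
+ */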
+ va = sctx->scratch_buffer->gpu_address;
+ si_emit_cp_dma_copy_buffer(sctx, va, va + CP_DMA_ALIGNMENT, size,
+ dma_flags);
+}
+
void si_copy_buffer(struct si_context *sctx,
struct pipe_resource *dst, struct pipe_resource *src,
uint64_t dst_offset, uint64_t src_offset, unsigned size,
bool is_framebuffer)
{
+ uint64_t main_dst_offset, main_src_offset;
+ unsigned skipped_size = 0;
+ unsigned realign_size = 0;
unsigned tc_l2_flag = get_tc_l2_flag(sctx, is_framebuffer);
unsigned flush_flags = get_flush_flags(sctx, is_framebuffer);
dst_offset += r600_resource(dst)->gpu_address;
src_offset += r600_resource(src)->gpu_address;
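+ /* From here on, dst_offset and src_offset are GPU virtual addresses;
+ * the alignment checks below therefore apply to the final addresses.
+ */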
+ /* The workarounds aren't needed on Fiji and beyond, except on Stoney. */
+ if (sctx->b.family <= CHIP_CARRIZO ||
+ sctx->b.family == CHIP_STONEY) {
+ /* If the size is not aligned, we must add a dummy copy at the end
+ * just to align the internal counter. Otherwise, the DMA engine
+ * would slow down by an order of magnitude for following copies.
+ */
+ if (size % CP_DMA_ALIGNMENT)
+ realign_size = CP_DMA_ALIGNMENT - (size % CP_DMA_ALIGNMENT);
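+ /* e.g. size = 100: 100 % 32 = 4, so a 28-byte dummy copy at
+ * the end restores the 32-byte alignment.
+ */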
+
+ /* If the copy begins unaligned, we must start copying from the next
+ * aligned block and the skipped part should be copied after everything
+ * else has been copied. Only the src alignment matters, not dst.
+ */
+ if (src_offset % CP_DMA_ALIGNMENT) {
+ skipped_size = CP_DMA_ALIGNMENT - (src_offset % CP_DMA_ALIGNMENT);
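+ /* e.g. src_offset % 32 == 24 gives skipped_size = 8; those bytes
+ * are copied after the main loop, which then starts reading on a
+ * 32-byte boundary.
+ */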
+ /* The main part will be skipped if the size is too small. */
+ skipped_size = MIN2(skipped_size, size);
+ size -= skipped_size;
+ }
+ }
+
/* Flush the caches. */
sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
+ /* This is the main part doing the copying. Src is always aligned. */
+ main_dst_offset = dst_offset + skipped_size;
+ main_src_offset = src_offset + skipped_size;
+
while (size) {
unsigned dma_flags = tc_l2_flag;
unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
- si_cp_dma_prepare(sctx, dst, src, byte_count, size, &dma_flags);
+ si_cp_dma_prepare(sctx, dst, src, byte_count,
+ size + skipped_size + realign_size,
+ &dma_flags);
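+ /* The remaining size passed to si_cp_dma_prepare includes the skipped
+ * head and the realign tail, so only the very last packet of the whole
+ * sequence gets R600_CP_DMA_SYNC.
+ */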
- si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, dma_flags);
+ si_emit_cp_dma_copy_buffer(sctx, main_dst_offset, main_src_offset,
+ byte_count, dma_flags);
size -= byte_count;
- src_offset += byte_count;
- dst_offset += byte_count;
+ main_src_offset += byte_count;
+ main_dst_offset += byte_count;
}
+ /* Copy the part we skipped because src wasn't aligned. */
+ if (skipped_size) {
+ unsigned dma_flags = tc_l2_flag;
+
+ si_cp_dma_prepare(sctx, dst, src, skipped_size,
+ skipped_size + realign_size,
+ &dma_flags);
+
+ si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset,
+ skipped_size, dma_flags);
+ }
+
+ /* Finally, realign the engine if the size wasn't aligned. */
+ if (realign_size)
+ si_cp_dma_realign_engine(sctx, realign_size);
+
/* Flush the caches again in case the 3D engine has been prefetching
* the resource. */
sctx->b.flags |= flush_flags;