- uint64_t main_dst_offset, main_src_offset;
- unsigned skipped_size = 0;
- unsigned realign_size = 0;
- unsigned gds_flags = (dst ? 0 : CP_DMA_DST_IS_GDS) |
- (src ? 0 : CP_DMA_SRC_IS_GDS);
- bool is_first = true;
-
- assert(size);
-
- if (dst) {
- /* Skip this for the L2 prefetch. */
- if (dst != src || dst_offset != src_offset) {
- /* Mark the buffer range of destination as valid (initialized),
- * so that transfer_map knows it should wait for the GPU when mapping
- * that range. */
- util_range_add(dst, &si_resource(dst)->valid_buffer_range, dst_offset,
- dst_offset + size);
- }
-
- dst_offset += si_resource(dst)->gpu_address;
- }
- if (src)
- src_offset += si_resource(src)->gpu_address;
-
- /* The workarounds aren't needed on Fiji and beyond. */
- if (sctx->family <= CHIP_CARRIZO ||
- sctx->family == CHIP_STONEY) {
- /* If the size is not aligned, we must add a dummy copy at the end
- * just to align the internal counter. Otherwise, the DMA engine
- * would slow down by an order of magnitude for following copies.
- */
- if (size % SI_CPDMA_ALIGNMENT)
- realign_size = SI_CPDMA_ALIGNMENT - (size % SI_CPDMA_ALIGNMENT);
-
- /* If the copy begins unaligned, we must start copying from the next
- * aligned block and the skipped part should be copied after everything
- * else has been copied. Only the src alignment matters, not dst.
- *
- * GDS doesn't need the source address to be aligned.
- */
- if (src && src_offset % SI_CPDMA_ALIGNMENT) {
- skipped_size = SI_CPDMA_ALIGNMENT - (src_offset % SI_CPDMA_ALIGNMENT);
- /* The main part will be skipped if the size is too small. */
- skipped_size = MIN2(skipped_size, size);
- size -= skipped_size;
- }
- }
-
- /* Flush the caches. */
- if ((dst || src) && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) {
- sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
- SI_CONTEXT_CS_PARTIAL_FLUSH |
- si_get_flush_flags(sctx, coher, cache_policy);
- }
-
- /* This is the main part doing the copying. Src is always aligned. */
- main_dst_offset = dst_offset + skipped_size;
- main_src_offset = src_offset + skipped_size;
-
- while (size) {
- unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
- unsigned dma_flags = gds_flags;
-
- si_cp_dma_prepare(sctx, dst, src, byte_count,
- size + skipped_size + realign_size,
- user_flags, coher, &is_first, &dma_flags);
-
- si_emit_cp_dma(sctx, sctx->gfx_cs, main_dst_offset, main_src_offset,
- byte_count, dma_flags, cache_policy);
-
- size -= byte_count;
- main_src_offset += byte_count;
- main_dst_offset += byte_count;
- }
-
- /* Copy the part we skipped because src wasn't aligned. */
- if (skipped_size) {
- unsigned dma_flags = gds_flags;
-
- si_cp_dma_prepare(sctx, dst, src, skipped_size,
- skipped_size + realign_size, user_flags,
- coher, &is_first, &dma_flags);
-
- si_emit_cp_dma(sctx, sctx->gfx_cs, dst_offset, src_offset, skipped_size,
- dma_flags, cache_policy);
- }
-
- /* Finally, realign the engine if the size wasn't aligned. */
- if (realign_size) {
- si_cp_dma_realign_engine(sctx, realign_size, user_flags, coher,
- cache_policy, &is_first);
- }
-
- if (dst && cache_policy != L2_BYPASS)
- si_resource(dst)->TC_L2_dirty = true;
-
- /* If it's not a prefetch or GDS copy... */
- if (dst && src && (dst != src || dst_offset != src_offset)) {
- sctx->num_cp_dma_calls++;
- si_prim_discard_signal_next_compute_ib_start(sctx);
- }
+ uint64_t main_dst_offset, main_src_offset;
+ unsigned skipped_size = 0;
+ unsigned realign_size = 0;
+ unsigned gds_flags = (dst ? 0 : CP_DMA_DST_IS_GDS) | (src ? 0 : CP_DMA_SRC_IS_GDS);
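+ /* is_first tracks whether the next packet opens the sequence, so that
+ * si_cp_dma_prepare can make only the first packet wait for previous CP DMA
+ * work and add the sync flag only after the last one. */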
+ bool is_first = true;
+
+ assert(size);
+
+ if (dst) {
+ /* Skip this for the L2 prefetch. */
+ if (dst != src || dst_offset != src_offset) {
+ /* Mark the buffer range of destination as valid (initialized),
+ * so that transfer_map knows it should wait for the GPU when mapping
+ * that range. */
+ util_range_add(dst, &si_resource(dst)->valid_buffer_range, dst_offset, dst_offset + size);
+ }
+
+ dst_offset += si_resource(dst)->gpu_address;
+ }
+ if (src)
+ src_offset += si_resource(src)->gpu_address;
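+ /* Non-GDS offsets are now absolute GPU virtual addresses; a NULL buffer
+ * means the corresponding address is a GDS offset instead. */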
+
+ /* The alignment workarounds aren't needed on Fiji and beyond. Stoney comes
+ * after Fiji in the family enum but still needs them, hence the extra check. */
+ if (sctx->family <= CHIP_CARRIZO || sctx->family == CHIP_STONEY) {
+ /* If the size is not aligned, we must add a dummy copy at the end
+ * just to align the internal counter. Otherwise, the DMA engine
+ * would slow down by an order of magnitude for following copies.
+ */
+ if (size % SI_CPDMA_ALIGNMENT)
+ realign_size = SI_CPDMA_ALIGNMENT - (size % SI_CPDMA_ALIGNMENT);
+
+ /* If the copy begins unaligned, we must start copying from the next
+ * aligned block and the skipped part should be copied after everything
+ * else has been copied. Only the src alignment matters, not dst.
+ *
+ * GDS doesn't need the source address to be aligned.
+ */
+ if (src && src_offset % SI_CPDMA_ALIGNMENT) {
+ skipped_size = SI_CPDMA_ALIGNMENT - (src_offset % SI_CPDMA_ALIGNMENT);
+ /* The main part will be skipped if the size is too small. */
+ skipped_size = MIN2(skipped_size, size);
+ size -= skipped_size;
+ }
+ }
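+ /* Worked example, assuming SI_CPDMA_ALIGNMENT == 32: for size == 100 and
+ * src_offset % 32 == 5, realign_size == 28 (since 100 % 32 == 4) and
+ * skipped_size == 27, leaving an aligned 73-byte main copy; the engine
+ * then sees 100 + 28 == 128 bytes in total, keeping its counter aligned.
+ */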
+
+ /* TMZ handling: an encrypted (TMZ) source may only be copied into an
+ * encrypted destination, and such copies must execute in a secure IB.
+ * If the required secure state differs from the current IB's, flush it
+ * and flip the state for the next IB.
+ */
+ if (unlikely(sctx->ws->ws_is_secure(sctx->ws) &&
+ !(user_flags & SI_CPDMA_SKIP_TMZ))) {
+ bool secure = src && (si_resource(src)->flags & RADEON_FLAG_ENCRYPTED);
+ assert(!secure || (!dst || (si_resource(dst)->flags & RADEON_FLAG_ENCRYPTED)));
+ if (secure != sctx->ws->cs_is_secure(sctx->gfx_cs)) {
+ si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+ sctx->ws->cs_set_secure(sctx->gfx_cs, secure);
+ }
+ }
+
+ /* Flush the caches. */
+ if ((dst || src) && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) {
+ sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
+ si_get_flush_flags(sctx, coher, cache_policy);
+ }
+
+ /* This is the main part doing the copying. Src is always aligned. */
+ main_dst_offset = dst_offset + skipped_size;
+ main_src_offset = src_offset + skipped_size;
+
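+ /* A single CP DMA packet can only copy a limited number of bytes, so
+ * larger copies are split into packets of at most cp_dma_max_byte_count. */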
+ while (size) {
+ unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
+ unsigned dma_flags = gds_flags;
+
+ si_cp_dma_prepare(sctx, dst, src, byte_count, size + skipped_size + realign_size, user_flags,
+ coher, &is_first, &dma_flags);
+
+ si_emit_cp_dma(sctx, sctx->gfx_cs, main_dst_offset, main_src_offset, byte_count, dma_flags,
+ cache_policy);
+
+ size -= byte_count;
+ main_src_offset += byte_count;
+ main_dst_offset += byte_count;
+ }
+
+ /* Copy the part we skipped because src wasn't aligned. */
+ if (skipped_size) {
+ unsigned dma_flags = gds_flags;
+
+ si_cp_dma_prepare(sctx, dst, src, skipped_size, skipped_size + realign_size, user_flags,
+ coher, &is_first, &dma_flags);
+
+ si_emit_cp_dma(sctx, sctx->gfx_cs, dst_offset, src_offset, skipped_size, dma_flags,
+ cache_policy);
+ }
+
+ /* Finally, realign the engine if the size wasn't aligned. */
+ if (realign_size) {
+ si_cp_dma_realign_engine(sctx, realign_size, user_flags, coher, cache_policy, &is_first);
+ }
+
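+ /* The copy went through L2, so engines that bypass L2 (e.g. SDMA) must
+ * flush it before reading this buffer. */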
+ if (dst && cache_policy != L2_BYPASS)
+ si_resource(dst)->TC_L2_dirty = true;
+
+ /* If it's not an L2 prefetch (dst == src) or a GDS copy (dst or src is NULL)... */
+ if (dst && src && (dst != src || dst_offset != src_offset)) {
+ sctx->num_cp_dma_calls++;
+ si_prim_discard_signal_next_compute_ib_start(sctx);
+ }