/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
28 static void si_dma_emit_wait_idle(struct si_context
*sctx
)
30 struct radeon_cmdbuf
*cs
= sctx
->sdma_cs
;
32 /* NOP waits for idle. */
33 if (sctx
->chip_class
>= GFX7
)
34 radeon_emit(cs
, 0x00000000); /* NOP */
36 radeon_emit(cs
, 0xf0000000); /* NOP */
39 void si_dma_emit_timestamp(struct si_context
*sctx
, struct si_resource
*dst
, uint64_t offset
)
41 struct radeon_cmdbuf
*cs
= sctx
->sdma_cs
;
42 uint64_t va
= dst
->gpu_address
+ offset
;
44 if (sctx
->chip_class
== GFX6
) {
45 unreachable("SI DMA doesn't support the timestamp packet.");
49 /* Mark the buffer range of destination as valid (initialized),
50 * so that transfer_map knows it should wait for the GPU when mapping
52 util_range_add(&dst
->b
.b
, &dst
->valid_buffer_range
, offset
, offset
+ 8);
56 si_need_dma_space(sctx
, 4, dst
, NULL
);
57 si_dma_emit_wait_idle(sctx
);
60 cs
, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP
, SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP
, 0));
62 radeon_emit(cs
, va
>> 32);
65 void si_sdma_clear_buffer(struct si_context
*sctx
, struct pipe_resource
*dst
, uint64_t offset
,
66 uint64_t size
, unsigned clear_value
)
68 struct radeon_cmdbuf
*cs
= sctx
->sdma_cs
;
69 unsigned i
, ncopy
, csize
;
70 struct si_resource
*sdst
= si_resource(dst
);
72 assert(offset
% 4 == 0);
74 assert(size
% 4 == 0);
76 if (!cs
|| dst
->flags
& PIPE_RESOURCE_FLAG_SPARSE
||
77 sctx
->screen
->debug_flags
& DBG(NO_SDMA_CLEARS
) || sctx
->ws
->ws_is_secure(sctx
->ws
)) {
78 sctx
->b
.clear_buffer(&sctx
->b
, dst
, offset
, size
, &clear_value
, 4);
82 /* Mark the buffer range of destination as valid (initialized),
83 * so that transfer_map knows it should wait for the GPU when mapping
85 util_range_add(dst
, &sdst
->valid_buffer_range
, offset
, offset
+ size
);
87 offset
+= sdst
->gpu_address
;
89 if (sctx
->chip_class
== GFX6
) {
90 /* the same maximum size as for copying */
91 ncopy
= DIV_ROUND_UP(size
, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE
);
92 si_need_dma_space(sctx
, ncopy
* 4, sdst
, NULL
);
94 for (i
= 0; i
< ncopy
; i
++) {
95 csize
= MIN2(size
, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE
);
96 radeon_emit(cs
, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL
, 0, csize
/ 4));
97 radeon_emit(cs
, offset
);
98 radeon_emit(cs
, clear_value
);
99 radeon_emit(cs
, (offset
>> 32) << 16);
106 /* The following code is for CI and later. */
107 /* the same maximum size as for copying */
108 unsigned max_size_per_packet
= sctx
->chip_class
>= GFX10_3
?
109 GFX103_SDMA_COPY_MAX_SIZE
:
110 CIK_SDMA_COPY_MAX_SIZE
;
111 ncopy
= DIV_ROUND_UP(size
, max_size_per_packet
);
112 si_need_dma_space(sctx
, ncopy
* 5, sdst
, NULL
);
114 for (i
= 0; i
< ncopy
; i
++) {
115 csize
= MIN2(size
, max_size_per_packet
);
116 radeon_emit(cs
, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL
, 0, 0x8000 /* dword copy */));
117 radeon_emit(cs
, offset
);
118 radeon_emit(cs
, offset
>> 32);
119 radeon_emit(cs
, clear_value
);
121 radeon_emit(cs
, (sctx
->chip_class
>= GFX9
? csize
- 1 : csize
) & 0xfffffffc);
127 void si_sdma_copy_buffer(struct si_context
*sctx
, struct pipe_resource
*dst
,
128 struct pipe_resource
*src
, uint64_t dst_offset
, uint64_t src_offset
,
131 struct radeon_cmdbuf
*cs
= sctx
->sdma_cs
;
132 unsigned i
, ncopy
, csize
;
133 struct si_resource
*sdst
= si_resource(dst
);
134 struct si_resource
*ssrc
= si_resource(src
);
136 if (!cs
|| dst
->flags
& PIPE_RESOURCE_FLAG_SPARSE
|| src
->flags
& PIPE_RESOURCE_FLAG_SPARSE
||
137 (ssrc
->flags
& RADEON_FLAG_ENCRYPTED
) != (sdst
->flags
& RADEON_FLAG_ENCRYPTED
)) {
138 si_copy_buffer(sctx
, dst
, src
, dst_offset
, src_offset
, size
);
142 /* Mark the buffer range of destination as valid (initialized),
143 * so that transfer_map knows it should wait for the GPU when mapping
145 util_range_add(dst
, &sdst
->valid_buffer_range
, dst_offset
, dst_offset
+ size
);
147 dst_offset
+= sdst
->gpu_address
;
148 src_offset
+= ssrc
->gpu_address
;
150 if (sctx
->chip_class
== GFX6
) {
151 unsigned max_size
, sub_cmd
, shift
;
153 /* see whether we should use the dword-aligned or byte-aligned copy */
154 if (!(dst_offset
% 4) && !(src_offset
% 4) && !(size
% 4)) {
155 sub_cmd
= SI_DMA_COPY_DWORD_ALIGNED
;
157 max_size
= SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE
;
159 sub_cmd
= SI_DMA_COPY_BYTE_ALIGNED
;
161 max_size
= SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE
;
164 ncopy
= DIV_ROUND_UP(size
, max_size
);
165 si_need_dma_space(sctx
, ncopy
* 5, sdst
, ssrc
);
167 for (i
= 0; i
< ncopy
; i
++) {
168 csize
= MIN2(size
, max_size
);
169 radeon_emit(cs
, SI_DMA_PACKET(SI_DMA_PACKET_COPY
, sub_cmd
, csize
>> shift
));
170 radeon_emit(cs
, dst_offset
);
171 radeon_emit(cs
, src_offset
);
172 radeon_emit(cs
, (dst_offset
>> 32UL) & 0xff);
173 radeon_emit(cs
, (src_offset
>> 32UL) & 0xff);
181 /* The following code is for CI and later. */
182 unsigned max_size_per_packet
= sctx
->chip_class
>= GFX10_3
?
183 GFX103_SDMA_COPY_MAX_SIZE
:
184 CIK_SDMA_COPY_MAX_SIZE
;
185 unsigned align
= ~0u;
186 ncopy
= DIV_ROUND_UP(size
, max_size_per_packet
);
188 /* Align copy size to dw if src/dst address are dw aligned */
189 if ((src_offset
& 0x3) == 0 && (dst_offset
& 0x3) == 0 && size
> 4 && (size
& 3) != 0) {
194 si_need_dma_space(sctx
, ncopy
* 7, sdst
, ssrc
);
196 for (i
= 0; i
< ncopy
; i
++) {
197 csize
= size
>= 4 ? MIN2(size
& align
, max_size_per_packet
) : size
;
198 radeon_emit(cs
, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY
, CIK_SDMA_COPY_SUB_OPCODE_LINEAR
,
199 (sctx
->ws
->cs_is_secure(cs
) ? 1u : 0) << 2));
200 radeon_emit(cs
, sctx
->chip_class
>= GFX9
? csize
- 1 : csize
);
201 radeon_emit(cs
, 0); /* src/dst endian swap */
202 radeon_emit(cs
, src_offset
);
203 radeon_emit(cs
, src_offset
>> 32);
204 radeon_emit(cs
, dst_offset
);
205 radeon_emit(cs
, dst_offset
>> 32);
212 void si_need_dma_space(struct si_context
*ctx
, unsigned num_dw
, struct si_resource
*dst
,
213 struct si_resource
*src
)
215 struct radeon_winsys
*ws
= ctx
->ws
;
216 uint64_t vram
= ctx
->sdma_cs
->used_vram
;
217 uint64_t gtt
= ctx
->sdma_cs
->used_gart
;
220 vram
+= dst
->vram_usage
;
221 gtt
+= dst
->gart_usage
;
224 vram
+= src
->vram_usage
;
225 gtt
+= src
->gart_usage
;
228 /* Flush the GFX IB if DMA depends on it. */
229 if (!ctx
->sdma_uploads_in_progress
&& radeon_emitted(ctx
->gfx_cs
, ctx
->initial_gfx_cs_size
) &&
230 ((dst
&& ws
->cs_is_buffer_referenced(ctx
->gfx_cs
, dst
->buf
, RADEON_USAGE_READWRITE
)) ||
231 (src
&& ws
->cs_is_buffer_referenced(ctx
->gfx_cs
, src
->buf
, RADEON_USAGE_WRITE
))))
232 si_flush_gfx_cs(ctx
, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW
, NULL
);
234 bool use_secure_cmd
= false;
235 /* if TMZ is supported and enabled */
236 if (ctx
->ws
->ws_is_secure(ctx
->ws
)) {
237 if (src
&& src
->flags
& RADEON_FLAG_ENCRYPTED
) {
238 assert(!dst
|| (dst
->flags
& RADEON_FLAG_ENCRYPTED
));
239 use_secure_cmd
= true;
240 } else if (dst
&& (dst
->flags
& RADEON_FLAG_ENCRYPTED
)) {
241 use_secure_cmd
= true;
245 /* Flush if there's not enough space, or if the memory usage per IB
248 * IBs using too little memory are limited by the IB submission overhead.
249 * IBs using too much memory are limited by the kernel/TTM overhead.
250 * Too long IBs create CPU-GPU pipeline bubbles and add latency.
252 * This heuristic makes sure that DMA requests are executed
253 * very soon after the call is made and lowers memory usage.
254 * It improves texture upload performance by keeping the DMA
255 * engine busy while uploads are being submitted.
257 num_dw
++; /* for emit_wait_idle below */
258 if (!ctx
->sdma_uploads_in_progress
&&
259 (use_secure_cmd
!= ctx
->ws
->cs_is_secure(ctx
->sdma_cs
) ||
260 !ws
->cs_check_space(ctx
->sdma_cs
, num_dw
, false) ||
261 ctx
->sdma_cs
->used_vram
+ ctx
->sdma_cs
->used_gart
> 64 * 1024 * 1024 ||
262 !radeon_cs_memory_below_limit(ctx
->screen
, ctx
->sdma_cs
, vram
, gtt
))) {
263 si_flush_dma_cs(ctx
, PIPE_FLUSH_ASYNC
, NULL
);
264 assert((num_dw
+ ctx
->sdma_cs
->current
.cdw
) <= ctx
->sdma_cs
->current
.max_dw
);
266 ctx
->ws
->cs_set_secure(ctx
->sdma_cs
, use_secure_cmd
);
268 /* Wait for idle if either buffer has been used in the IB before to
269 * prevent read-after-write hazards.
271 if ((dst
&& ws
->cs_is_buffer_referenced(ctx
->sdma_cs
, dst
->buf
, RADEON_USAGE_READWRITE
)) ||
272 (src
&& ws
->cs_is_buffer_referenced(ctx
->sdma_cs
, src
->buf
, RADEON_USAGE_WRITE
)))
273 si_dma_emit_wait_idle(ctx
);
275 unsigned sync
= ctx
->sdma_uploads_in_progress
? 0 : RADEON_USAGE_SYNCHRONIZED
;
277 ws
->cs_add_buffer(ctx
->sdma_cs
, dst
->buf
, RADEON_USAGE_WRITE
| sync
, dst
->domains
, 0);
280 ws
->cs_add_buffer(ctx
->sdma_cs
, src
->buf
, RADEON_USAGE_READ
| sync
, src
->domains
, 0);
283 /* this function is called before all DMA calls, so increment this. */
284 ctx
->num_dma_calls
++;
287 void si_flush_dma_cs(struct si_context
*ctx
, unsigned flags
, struct pipe_fence_handle
**fence
)
289 struct radeon_cmdbuf
*cs
= ctx
->sdma_cs
;
290 struct radeon_saved_cs saved
;
291 bool check_vm
= (ctx
->screen
->debug_flags
& DBG(CHECK_VM
)) != 0;
293 if (!radeon_emitted(cs
, 0)) {
295 ctx
->ws
->fence_reference(fence
, ctx
->last_sdma_fence
);
300 si_save_cs(ctx
->ws
, cs
, &saved
, true);
302 ctx
->ws
->cs_flush(cs
, flags
, &ctx
->last_sdma_fence
);
304 ctx
->ws
->fence_reference(fence
, ctx
->last_sdma_fence
);
307 /* Use conservative timeout 800ms, after which we won't wait any
308 * longer and assume the GPU is hung.
310 ctx
->ws
->fence_wait(ctx
->ws
, ctx
->last_sdma_fence
, 800 * 1000 * 1000);
312 si_check_vm_faults(ctx
, &saved
, RING_DMA
);
313 si_clear_saved_cs(&saved
);
317 void si_screen_clear_buffer(struct si_screen
*sscreen
, struct pipe_resource
*dst
, uint64_t offset
,
318 uint64_t size
, unsigned value
)
320 struct si_context
*ctx
= (struct si_context
*)sscreen
->aux_context
;
322 simple_mtx_lock(&sscreen
->aux_context_lock
);
323 si_sdma_clear_buffer(ctx
, dst
, offset
, size
, value
);
324 sscreen
->aux_context
->flush(sscreen
->aux_context
, NULL
, 0);
325 simple_mtx_unlock(&sscreen
->aux_context_lock
);