/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "util/u_format.h"

/* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
 * and L2_STREAM for src.
 */
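/* For reference (illustrative numbers): on GFX9, a shader-coherent 128 KiB
 * operation gets L2_LRU, a 1 MiB one gets L2_STREAM, and everything on SI
 * (below CIK) falls back to L2_BYPASS.
 */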
static enum si_cache_policy get_cache_policy(struct si_context *sctx,
					     enum si_coherency coher,
					     uint64_t size)
{
	if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META ||
					  coher == SI_COHERENCY_CP)) ||
	    (sctx->chip_class >= CIK && coher == SI_COHERENCY_SHADER))
		return size <= 256 * 1024 ? L2_LRU : L2_STREAM;

	return L2_BYPASS;
}

unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
			    enum si_cache_policy cache_policy)
{
	switch (coher) {
	default:
	case SI_COHERENCY_NONE:
	case SI_COHERENCY_CP:
		return 0;
	case SI_COHERENCY_SHADER:
		return SI_CONTEXT_INV_SMEM_L1 |
		       SI_CONTEXT_INV_VMEM_L1 |
		       (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
	case SI_COHERENCY_CB_META:
		return SI_CONTEXT_FLUSH_AND_INV_CB;
	}
}

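/* Internal compute dispatches should not be counted in the application's
 * pipeline-statistics queries and must ignore the render condition, so both
 * are suspended here and restored in si_compute_internal_end.
 */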
static void si_compute_internal_begin(struct si_context *sctx)
{
	sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
	sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
	sctx->render_cond_force_off = true;
}

static void si_compute_internal_end(struct si_context *sctx)
{
	sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
	sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
	sctx->render_cond_force_off = false;
}

static void si_compute_do_clear_or_copy(struct si_context *sctx,
					struct pipe_resource *dst,
					unsigned dst_offset,
					struct pipe_resource *src,
					unsigned src_offset,
					unsigned size,
					const uint32_t *clear_value,
					unsigned clear_value_size,
					enum si_coherency coher)
{
	struct pipe_context *ctx = &sctx->b;

	assert(src_offset % 4 == 0);
	assert(dst_offset % 4 == 0);
	assert(size % 4 == 0);

	assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
	assert(!src || src_offset + size <= src->width0);

	si_compute_internal_begin(sctx);
	sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
		       SI_CONTEXT_CS_PARTIAL_FLUSH |
		       si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);

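	/* Save states. */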
	void *saved_cs = sctx->cs_shader_state.program;
	struct pipe_shader_buffer saved_sb[2] = {};
	si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);

	/* The memory accesses are coalesced, meaning that the 1st instruction
	 * writes the 1st contiguous block of data for the whole wave, the 2nd
	 * instruction writes the 2nd contiguous block of data, etc.
	 */
	unsigned dwords_per_thread = src ? SI_COMPUTE_COPY_DW_PER_THREAD :
					   SI_COMPUTE_CLEAR_DW_PER_THREAD;
	unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
	unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
	unsigned dwords_per_wave = dwords_per_thread * 64;

	unsigned num_dwords = size / 4;
	unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

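	/* Worked example (assuming dwords_per_thread == 4): each thread issues
	 * one 4-dword instruction and a 64-thread wave covers 256 dwords, so a
	 * 1 MiB clear (262144 dwords) launches DIV_ROUND_UP(262144, 256) =
	 * 1024 waves.
	 */
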
	struct pipe_grid_info info = {};
	info.block[0] = MIN2(64, num_instructions);
	info.block[1] = 1;
	info.block[2] = 1;
	info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
	info.grid[1] = 1;
	info.grid[2] = 1;

	struct pipe_shader_buffer sb[2] = {};
	sb[0].buffer = dst;
	sb[0].buffer_offset = dst_offset;
	sb[0].buffer_size = size;

	bool shader_dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU;

	if (src) {
		sb[1].buffer = src;
		sb[1].buffer_offset = src_offset;
		sb[1].buffer_size = size;

		ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb);

		if (!sctx->cs_copy_buffer) {
			sctx->cs_copy_buffer =
				si_create_dma_compute_shader(&sctx->b,
							     SI_COMPUTE_COPY_DW_PER_THREAD,
							     shader_dst_stream_policy, true);
		}
		ctx->bind_compute_state(ctx, sctx->cs_copy_buffer);
	} else {
		assert(clear_value_size >= 4 &&
		       clear_value_size <= 16 &&
		       util_is_power_of_two_or_zero(clear_value_size));

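		/* Replicate the clear value to fill all 16 bytes of cs_user_data. */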
		for (unsigned i = 0; i < 4; i++)
			sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];

		ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb);

		if (!sctx->cs_clear_buffer) {
			sctx->cs_clear_buffer =
				si_create_dma_compute_shader(&sctx->b,
							     SI_COMPUTE_CLEAR_DW_PER_THREAD,
							     shader_dst_stream_policy, false);
		}
		ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
	}

	ctx->launch_grid(ctx, &info);

	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
		       (cache_policy == L2_BYPASS ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0);

	if (cache_policy != L2_BYPASS)
		r600_resource(dst)->TC_L2_dirty = true;

	/* Restore states. */
	ctx->bind_compute_state(ctx, saved_cs);
	ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);
	si_compute_internal_end(sctx);
}

void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
		     uint64_t offset, uint64_t size, uint32_t *clear_value,
		     uint32_t clear_value_size, enum si_coherency coher)
{
	if (!size)
		return;

	unsigned clear_alignment = MIN2(clear_value_size, 4);

	assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
	assert(offset % clear_alignment == 0);
	assert(size % clear_alignment == 0);
	assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */

	/* Reduce a large clear value size if possible. */
	if (clear_value_size > 4) {
		bool clear_dword_duplicated = true;

		/* See if we can lower large fills to dword fills. */
		for (unsigned i = 1; i < clear_value_size / 4; i++) {
			if (clear_value[0] != clear_value[i]) {
				clear_dword_duplicated = false;
				break;
			}
		}
		if (clear_dword_duplicated)
			clear_value_size = 4;
	}

	/* Expand a small clear value size. */
	uint32_t tmp_clear_value;
	if (clear_value_size <= 2) {
		if (clear_value_size == 1) {
			tmp_clear_value = *(uint8_t*)clear_value;
			tmp_clear_value |= (tmp_clear_value << 8) |
					   (tmp_clear_value << 16) |
					   (tmp_clear_value << 24);
		} else {
			tmp_clear_value = *(uint16_t*)clear_value;
			tmp_clear_value |= tmp_clear_value << 16;
		}
		clear_value = &tmp_clear_value;
		clear_value_size = 4;
	}

	/* Use transform feedback for 12-byte clears. */
	/* TODO: Use compute. */
	if (clear_value_size == 12) {
		union pipe_color_union streamout_clear_value;

		memcpy(&streamout_clear_value, clear_value, clear_value_size);
		si_blitter_begin(sctx, SI_DISABLE_RENDER_COND);
		util_blitter_clear_buffer(sctx->blitter, dst, offset,
					  size, clear_value_size / 4,
					  &streamout_clear_value);
		si_blitter_end(sctx);
		return;
	}

	uint64_t aligned_size = size & ~3ull;
	if (aligned_size >= 4) {
		/* Before GFX9, CP DMA was very slow when clearing GTT, so never
		 * use CP DMA clears on those chips, because we can't be certain
		 * about buffer placements.
		 */
		if (clear_value_size > 4 ||
		    (clear_value_size == 4 &&
		     offset % 4 == 0 &&
		     (size > 32*1024 || sctx->chip_class <= VI))) {
			si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
						    aligned_size, clear_value,
						    clear_value_size, coher);
		} else {
			assert(clear_value_size == 4);
			si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset,
					       aligned_size, *clear_value, 0, coher,
					       get_cache_policy(sctx, coher, size));
		}

		offset += aligned_size;
		size -= aligned_size;
	}

	/* Handle non-dword alignment. */
	if (size) {
		assert(dst);
		assert(dst->target == PIPE_BUFFER);
		assert(size < 4);

		pipe_buffer_write(&sctx->b, dst, offset, size, clear_value);
	}
}

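/* Usage sketch (illustrative only, not part of this file): zero-fill a whole
 * buffer with shader coherency:
 *
 *	uint32_t zero = 0;
 *	si_clear_buffer(sctx, buf, 0, buf->width0, &zero, 4, SI_COHERENCY_SHADER);
 */
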
static void si_pipe_clear_buffer(struct pipe_context *ctx,
				 struct pipe_resource *dst,
				 unsigned offset, unsigned size,
				 const void *clear_value,
				 int clear_value_size)
{
	enum si_coherency coher;

	if (dst->flags & SI_RESOURCE_FLAG_SO_FILLED_SIZE)
		coher = SI_COHERENCY_CP;
	else
		coher = SI_COHERENCY_SHADER;

	si_clear_buffer((struct si_context*)ctx, dst, offset, size,
			(uint32_t*)clear_value, clear_value_size, coher);
}

void si_copy_buffer(struct si_context *sctx,
		    struct pipe_resource *dst, struct pipe_resource *src,
		    uint64_t dst_offset, uint64_t src_offset, unsigned size)
{
	if (!size)
		return;

	enum si_coherency coher = SI_COHERENCY_SHADER;
	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);

	/* Only use compute for VRAM copies on dGPUs. */
	if (sctx->screen->info.has_dedicated_vram &&
	    r600_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
	    r600_resource(src)->domains & RADEON_DOMAIN_VRAM &&
	    size > 32 * 1024 &&
	    dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) {
		si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset,
					    size, NULL, 0, coher);
	} else {
		si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
				      0, coher, cache_policy);
	}
}

void si_compute_copy_image(struct si_context *sctx,
			   struct pipe_resource *dst,
			   unsigned dst_level,
			   struct pipe_resource *src,
			   unsigned src_level,
			   unsigned dstx, unsigned dsty, unsigned dstz,
			   const struct pipe_box *src_box)
{
	struct pipe_context *ctx = &sctx->b;
	unsigned width = src_box->width;
	unsigned height = src_box->height;
	unsigned depth = src_box->depth;

	unsigned data[] = {src_box->x, src_box->y, src_box->z, 0, dstx, dsty, dstz, 0};

	if (width == 0 || height == 0)
		return;

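	/* Flush caches and save the compute state (constant buffer 0, images
	 * 0-1, and the bound compute shader) that this function clobbers.
	 */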
	si_compute_internal_begin(sctx);
	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
	si_make_CB_shader_coherent(sctx, dst->nr_samples, true);

	struct pipe_constant_buffer saved_cb = {};
	si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);

	struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
	struct pipe_image_view saved_image[2] = {0};
	util_copy_image_view(&saved_image[0], &images->views[0]);
	util_copy_image_view(&saved_image[1], &images->views[1]);

	void *saved_cs = sctx->cs_shader_state.program;

	struct pipe_constant_buffer cb = {};
	cb.buffer_size = sizeof(data);
	cb.user_buffer = data;
	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);

	struct pipe_image_view image[2] = {0};
	image[0].resource = src;
	image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
	image[0].format = util_format_linear(src->format);
	image[0].u.tex.level = src_level;
	image[0].u.tex.first_layer = 0;
	image[0].u.tex.last_layer =
		src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1
					       : (unsigned)(src->array_size - 1);
	image[1].resource = dst;
	image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
	image[1].format = util_format_linear(dst->format);
	image[1].u.tex.level = dst_level;
	image[1].u.tex.first_layer = 0;
	image[1].u.tex.last_layer =
		dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1
					       : (unsigned)(dst->array_size - 1);

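	/* R9G9B9E5 can't be written by image stores directly; reinterpreting
	 * both views as R32_UINT copies the shared-exponent bits verbatim.
	 */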
	if (src->format == PIPE_FORMAT_R9G9B9E5_FLOAT)
		image[0].format = image[1].format = PIPE_FORMAT_R32_UINT;

	/* SNORM8 blitting has precision issues on some chips. Use the SINT
	 * equivalent instead, which doesn't force DCC decompression.
	 * Note that some chips avoid this issue by using SDMA.
	 */
	if (util_format_is_snorm8(dst->format)) {
		image[0].format = image[1].format =
			util_format_snorm8_to_sint8(dst->format);
	}

	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image);

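	/* Choose the workgroup shape: 64x1 threads for 1D-array copies, 8x8
	 * otherwise. compute_last_block holds the partial workgroup size at
	 * the right/bottom edge; zero means the grid divides evenly.
	 */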
	struct pipe_grid_info info = {0};

	if (dst->target == PIPE_TEXTURE_1D_ARRAY &&
	    src->target == PIPE_TEXTURE_1D_ARRAY) {
		if (!sctx->cs_copy_image_1d_array)
			sctx->cs_copy_image_1d_array =
				si_create_copy_image_compute_shader_1d_array(ctx);
		ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array);
		info.block[0] = 64;
		sctx->compute_last_block[0] = width % 64;
		info.block[1] = 1;
		info.block[2] = 1;
		info.grid[0] = DIV_ROUND_UP(width, 64);
		info.grid[1] = depth;
		info.grid[2] = 1;
	} else {
		if (!sctx->cs_copy_image)
			sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx);
		ctx->bind_compute_state(ctx, sctx->cs_copy_image);
		info.block[0] = 8;
		sctx->compute_last_block[0] = width % 8;
		info.block[1] = 8;
		sctx->compute_last_block[1] = height % 8;
		info.block[2] = 1;
		info.grid[0] = DIV_ROUND_UP(width, 8);
		info.grid[1] = DIV_ROUND_UP(height, 8);
		info.grid[2] = depth;
	}

	ctx->launch_grid(ctx, &info);

	sctx->compute_last_block[0] = 0;
	sctx->compute_last_block[1] = 0;

	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
		       (sctx->chip_class <= VI ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) |
		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
	ctx->bind_compute_state(ctx, saved_cs);
	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
	si_compute_internal_end(sctx);
}

void si_init_compute_blit_functions(struct si_context *sctx)
{
	sctx->b.clear_buffer = si_pipe_clear_buffer;
}