/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include "util/format/u_format.h"
#include "util/format_srgb.h"
/* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
 * and L2_STREAM for src.
 */
static enum si_cache_policy get_cache_policy(struct si_context *sctx, enum si_coherency coher,
                                             uint64_t size)
{
   if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META ||
                                     coher == SI_COHERENCY_DB_META ||
                                     coher == SI_COHERENCY_CP)) ||
       (sctx->chip_class >= GFX7 && coher == SI_COHERENCY_SHADER))
      return size <= 256 * 1024 ? L2_LRU : L2_STREAM;

   return L2_BYPASS;
}
unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
                            enum si_cache_policy cache_policy)
{
   switch (coher) {
   default:
   case SI_COHERENCY_NONE:
   case SI_COHERENCY_CP:
      return 0;
   case SI_COHERENCY_SHADER:
      return SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
             (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_L2 : 0);
   case SI_COHERENCY_CB_META:
      return SI_CONTEXT_FLUSH_AND_INV_CB;
   case SI_COHERENCY_DB_META:
      return SI_CONTEXT_FLUSH_AND_INV_DB;
   }
}
#define SI_CS_IMAGE_OP           (1 << 0)
#define SI_CS_WAIT_FOR_IDLE      (1 << 1)
#define SI_CS_RENDER_COND_ENABLE (1 << 2)
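/* How si_launch_grid_internal (below) interprets these flags:
 *  SI_CS_IMAGE_OP           - the dispatch writes through image stores, so extra CB/L2
 *                             visibility work is needed when waiting for idle
 *  SI_CS_WAIT_FOR_IDLE      - issue a CS_PARTIAL_FLUSH and make the stores visible afterwards
 *  SI_CS_RENDER_COND_ENABLE - honor the current render condition instead of forcing it off
 */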
static void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *info,
                                    void *restore_cs, unsigned flags)
{
   /* Wait for previous shaders to finish. */
   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH;
   /* Invalidate L0-L1 caches. */
   /* sL0 is never invalidated, because src resources don't use it. */
   sctx->flags |= SI_CONTEXT_INV_VCACHE;

   /* Set settings for driver-internal compute dispatches. */
   sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
   sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;

   if (!(flags & SI_CS_RENDER_COND_ENABLE))
      sctx->render_cond_force_off = true;

   /* Skip decompression to prevent infinite recursion. */
   if (sctx->blitter)
      sctx->blitter->running = true;

   /* Dispatch compute. */
   sctx->b.launch_grid(&sctx->b, info);

   /* Restore default settings. */
   sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
   sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
   sctx->render_cond_force_off = false;
   if (sctx->blitter)
      sctx->blitter->running = false;

   /* Restore the original compute shader. */
   sctx->b.bind_compute_state(&sctx->b, restore_cs);

   if (flags & SI_CS_WAIT_FOR_IDLE) {
      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;

      if (flags & SI_CS_IMAGE_OP) {
         /* Make sure image stores are visible to CB, which doesn't use L2 on GFX6-8. */
         sctx->flags |= sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0;
         /* Make sure image stores are visible to all CUs. */
         sctx->flags |= SI_CONTEXT_INV_VCACHE;
      } else {
         /* Make sure buffer stores are visible to all CUs. */
         sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
      }
   } else {
      assert(!(flags & SI_CS_IMAGE_OP));
   }
}
static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe_resource *dst,
                                            unsigned dst_offset, unsigned size,
                                            const uint32_t *clear_value, enum si_coherency coher)
{
   struct pipe_context *ctx = &sctx->b;

   assert(dst_offset % 4 == 0);
   assert(size % 4 == 0);
   unsigned size_12 = DIV_ROUND_UP(size, 12);

   unsigned data[4] = {0};
   memcpy(data, clear_value, 12);

   sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);

   struct pipe_shader_buffer saved_sb = {0};
   si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb);

   unsigned saved_writable_mask = 0;
   if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
       (1u << si_get_shaderbuf_slot(0)))
      saved_writable_mask = 1;

   struct pipe_constant_buffer saved_cb = {};
   si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);

   void *saved_cs = sctx->cs_shader_state.program;

   struct pipe_constant_buffer cb = {};
   cb.buffer_size = sizeof(data);
   cb.user_buffer = data;
   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);

   struct pipe_shader_buffer sb = {0};
   sb.buffer = dst;
   sb.buffer_offset = dst_offset;
   sb.buffer_size = size;

   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1);

   struct pipe_grid_info info = {0};

   if (!sctx->cs_clear_12bytes_buffer)
      sctx->cs_clear_12bytes_buffer = si_clear_12bytes_buffer_shader(ctx);
   ctx->bind_compute_state(ctx, sctx->cs_clear_12bytes_buffer);
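   /* One thread is assumed to store one 12-byte (3-dword) element, so a block of 64
    * threads covers 768 bytes. For example, a 4096-byte clear needs size_12 = 342
    * threads: grid[0] = 6 blocks, with last_block[0] = 22 trimming the final one. */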
   info.block[0] = 64;
   info.last_block[0] = size_12 % 64;
   info.block[1] = 1;
   info.block[2] = 1;
   info.grid[0] = DIV_ROUND_UP(size_12, 64);
   info.grid[1] = 1;
   info.grid[2] = 1;

   si_launch_grid_internal(sctx, &info, saved_cs, SI_CS_WAIT_FOR_IDLE);

   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask);
   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);

   pipe_resource_reference(&saved_sb.buffer, NULL);
   pipe_resource_reference(&saved_cb.buffer, NULL);
}
static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_resource *dst,
                                        unsigned dst_offset, struct pipe_resource *src,
                                        unsigned src_offset, unsigned size,
                                        const uint32_t *clear_value, unsigned clear_value_size,
                                        enum si_coherency coher)
{
   struct pipe_context *ctx = &sctx->b;

   assert(src_offset % 4 == 0);
   assert(dst_offset % 4 == 0);
   assert(size % 4 == 0);

   assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
   assert(!src || src_offset + size <= src->width0);

   sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);

   void *saved_cs = sctx->cs_shader_state.program;
   struct pipe_shader_buffer saved_sb[2] = {};
   si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);

   unsigned saved_writable_mask = 0;
   for (unsigned i = 0; i < (src ? 2 : 1); i++) {
      if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
          (1u << si_get_shaderbuf_slot(i)))
         saved_writable_mask |= 1 << i;
   }

   /* The memory accesses are coalesced, meaning that the 1st instruction writes
    * the 1st contiguous block of data for the whole wave, the 2nd instruction
    * writes the 2nd contiguous block of data, etc.
    */
   unsigned dwords_per_thread =
      src ? SI_COMPUTE_COPY_DW_PER_THREAD : SI_COMPUTE_CLEAR_DW_PER_THREAD;
   unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
   unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
   unsigned wave_size = sctx->screen->compute_wave_size;
   unsigned dwords_per_wave = dwords_per_thread * wave_size;

   unsigned num_dwords = size / 4;
   unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
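   /* Illustrative sizing (the actual SI_COMPUTE_*_DW_PER_THREAD values are defined elsewhere):
    * with 4 dwords per thread and wave64, each thread issues one 4-dword store and a wave
    * covers 1 KiB, so a 1 MiB clear launches 1024 wave-sized workgroups. */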
   struct pipe_grid_info info = {};
   info.block[0] = MIN2(wave_size, num_instructions);
   info.block[1] = 1;
   info.block[2] = 1;
   info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
   info.grid[1] = 1;
   info.grid[2] = 1;

   struct pipe_shader_buffer sb[2] = {};
   sb[0].buffer = dst;
   sb[0].buffer_offset = dst_offset;
   sb[0].buffer_size = size;

   bool shader_dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU;

   if (src) {
      sb[1].buffer = src;
      sb[1].buffer_offset = src_offset;
      sb[1].buffer_size = size;

      ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb, 0x1);

      if (!sctx->cs_copy_buffer) {
         sctx->cs_copy_buffer = si_create_dma_compute_shader(
            &sctx->b, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true);
      }
      ctx->bind_compute_state(ctx, sctx->cs_copy_buffer);
   } else {
      assert(clear_value_size >= 4 && clear_value_size <= 16 &&
             util_is_power_of_two_or_zero(clear_value_size));
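      /* Replicate the clear value into all 4 user-data dwords: a 4-byte value fills every
       * dword, an 8-byte value alternates its two dwords, a 16-byte value is copied as-is. */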
      for (unsigned i = 0; i < 4; i++)
         sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];

      ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb, 0x1);

      if (!sctx->cs_clear_buffer) {
         sctx->cs_clear_buffer = si_create_dma_compute_shader(
            &sctx->b, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false);
      }
      ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
   }

   si_launch_grid_internal(sctx, &info, saved_cs, SI_CS_WAIT_FOR_IDLE);

   enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
   sctx->flags |= cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0;

   if (cache_policy != L2_BYPASS)
      si_resource(dst)->TC_L2_dirty = true;

   /* Restore states. */
   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb,
                           saved_writable_mask);
   for (int i = 0; i < 2; i++)
      pipe_resource_reference(&saved_sb[i].buffer, NULL);
}
void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset,
                     uint64_t size, uint32_t *clear_value, uint32_t clear_value_size,
                     enum si_coherency coher, bool force_cpdma)
{
   if (!size)
      return;

   ASSERTED unsigned clear_alignment = MIN2(clear_value_size, 4);

   assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
   assert(offset % clear_alignment == 0);
   assert(size % clear_alignment == 0);
   assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */

   /* Reduce a large clear value size if possible. */
   if (clear_value_size > 4) {
      bool clear_dword_duplicated = true;

      /* See if we can lower large fills to dword fills. */
      for (unsigned i = 1; i < clear_value_size / 4; i++) {
         if (clear_value[0] != clear_value[i]) {
            clear_dword_duplicated = false;
            break;
         }
      }
      if (clear_dword_duplicated)
         clear_value_size = 4;
   }

   /* Expand a small clear value size. */
   uint32_t tmp_clear_value;
   if (clear_value_size <= 2) {
      if (clear_value_size == 1) {
         tmp_clear_value = *(uint8_t *)clear_value;
         tmp_clear_value |=
            (tmp_clear_value << 8) | (tmp_clear_value << 16) | (tmp_clear_value << 24);
      } else {
         tmp_clear_value = *(uint16_t *)clear_value;
         tmp_clear_value |= tmp_clear_value << 16;
      }
      clear_value = &tmp_clear_value;
      clear_value_size = 4;
   }
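   /* E.g. an 8-bit clear value 0xAB becomes 0xABABABAB and a 16-bit value 0x1234 becomes
    * 0x12341234, so the rest of the function only deals with dword-sized values. */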
   if (clear_value_size == 12) {
      si_compute_clear_12bytes_buffer(sctx, dst, offset, size, clear_value, coher);
      return;
   }

   uint64_t aligned_size = size & ~3ull;
   if (aligned_size >= 4) {
      uint64_t compute_min_size;

      /* CP DMA clears are terribly slow with GTT on GFX6-8, which can always
       * happen due to BO evictions.
       */
      if (sctx->chip_class <= GFX8) {
         compute_min_size = 0;
      } else if (sctx->chip_class >= GFX10 &&
                 sctx->screen->info.has_dedicated_vram &&
                 si_resource(dst)->domains & RADEON_DOMAIN_VRAM) {
         /* VRAM clears on gfx10 dGPUs */
         if (sctx->screen->info.vram_bit_width >= 192)
            compute_min_size = 128 * 1024;
         else
            compute_min_size = 1024 * 1024;
      } else if (sctx->screen->info.has_dedicated_vram &&
                 si_resource(dst)->domains & RADEON_DOMAIN_GTT) {
         /* GTT clears on gfx9 and gfx10 dGPUs */
         compute_min_size = UINT64_MAX; /* CP DMA is the best due to slow PCIe */
      } else {
         compute_min_size = 32 * 1024;
      }

      if (clear_value_size > 4 || (!force_cpdma && clear_value_size == 4 && offset % 4 == 0 &&
                                   size > compute_min_size)) {
         si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0, aligned_size, clear_value,
                                     clear_value_size, coher);
      } else {
         assert(clear_value_size == 4);
         si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset, aligned_size, *clear_value, 0,
                                coher, get_cache_policy(sctx, coher, size));
      }

      offset += aligned_size;
      size -= aligned_size;
   }

   /* Handle non-dword alignment. */
   if (size) {
      assert(dst->target == PIPE_BUFFER);
      assert(size < 4);

      pipe_buffer_write(&sctx->b, dst, offset, size, clear_value);
   }
}
static void si_pipe_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
                                 unsigned offset, unsigned size, const void *clear_value,
                                 int clear_value_size)
{
   si_clear_buffer((struct si_context *)ctx, dst, offset, size, (uint32_t *)clear_value,
                   clear_value_size, SI_COHERENCY_SHADER, false);
}
void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
                    uint64_t dst_offset, uint64_t src_offset, unsigned size)
{
   if (!size)
      return;

   enum si_coherency coher = SI_COHERENCY_SHADER;
   enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
   uint64_t compute_min_size;

   if (sctx->chip_class >= GFX10 &&
       sctx->screen->info.has_dedicated_vram &&
       si_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
       si_resource(src)->domains & RADEON_DOMAIN_VRAM) {
      /* VRAM copies on gfx10 dGPUs */
      if (sctx->screen->info.vram_bit_width >= 192)
         compute_min_size = 128 * 1024;
      else
         compute_min_size = 1024 * 1024;
   } else if (sctx->chip_class >= GFX10 &&
              sctx->screen->info.has_dedicated_vram &&
              (si_resource(dst)->domains | si_resource(src)->domains) & RADEON_DOMAIN_GTT) {
      compute_min_size = UINT64_MAX; /* CP DMA is the best due to slow PCIe */
   } else {
      compute_min_size = 32 * 1024;
   }

   /* Only use compute for VRAM copies on dGPUs. */
   if (sctx->screen->info.has_dedicated_vram && si_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
       si_resource(src)->domains & RADEON_DOMAIN_VRAM && size > compute_min_size &&
       dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) {
      si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset, size, NULL, 0, coher);
   } else {
      si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, 0, coher, cache_policy);
   }
}
void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level,
                           struct pipe_resource *src, unsigned src_level, unsigned dstx,
                           unsigned dsty, unsigned dstz, const struct pipe_box *src_box,
                           bool is_dcc_decompress)
{
   struct pipe_context *ctx = &sctx->b;
   unsigned width = src_box->width;
   unsigned height = src_box->height;
   unsigned depth = src_box->depth;
   enum pipe_format src_format = util_format_linear(src->format);
   enum pipe_format dst_format = util_format_linear(dst->format);
   bool is_linear = ((struct si_texture *)src)->surface.is_linear ||
                    ((struct si_texture *)dst)->surface.is_linear;

   assert(util_format_is_subsampled_422(src_format) == util_format_is_subsampled_422(dst_format));

   if (!vi_dcc_enabled((struct si_texture *)src, src_level) &&
       src_format == dst_format &&
       util_format_is_float(src_format) &&
       !util_format_is_compressed(src_format)) {
      /* Interpret as integer values to avoid NaN issues */
      switch (util_format_get_blocksizebits(src_format)) {
      case 16:
         src_format = dst_format = PIPE_FORMAT_R16_UINT;
         break;
      case 32:
         src_format = dst_format = PIPE_FORMAT_R32_UINT;
         break;
      case 64:
         src_format = dst_format = PIPE_FORMAT_R32G32_UINT;
         break;
      case 128:
         src_format = dst_format = PIPE_FORMAT_R32G32B32A32_UINT;
         break;
      default:
         assert(false);
      }
   }

   if (util_format_is_subsampled_422(src_format)) {
      src_format = dst_format = PIPE_FORMAT_R32_UINT;
      /* Interpreting 422 subsampled format (16 bpp) as 32 bpp
       * should force us to divide src_box->x, dstx and width by 2.
       * But given that ac_surface allocates this format as 32 bpp
       * and that surf_size is then modified to pack the values
       * we must keep the original values to get the correct results.
       */
   }

   if (width == 0 || height == 0)
      return;

   /* The driver doesn't decompress resources automatically here. */
   si_decompress_subresource(ctx, dst, PIPE_MASK_RGBAZS, dst_level, dstz,
                             dstz + src_box->depth - 1);
   si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level, src_box->z,
                             src_box->z + src_box->depth - 1);

   /* src and dst have the same number of samples. */
   si_make_CB_shader_coherent(sctx, src->nr_samples, true,
                              /* Only src can have DCC. */
                              ((struct si_texture *)src)->surface.u.gfx9.dcc.pipe_aligned);

   struct pipe_constant_buffer saved_cb = {};

   struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
   struct pipe_image_view saved_image[2] = {0};
   util_copy_image_view(&saved_image[0], &images->views[0]);
   util_copy_image_view(&saved_image[1], &images->views[1]);

   void *saved_cs = sctx->cs_shader_state.program;

   if (!is_dcc_decompress) {
      unsigned data[] = {src_box->x, src_box->y, src_box->z, 0, dstx, dsty, dstz, 0};
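      /* The copy shader is assumed to read the source origin from dwords 0-2 and the
       * destination origin from dwords 4-6 of this constant buffer (dwords 3 and 7 pad
       * each coordinate out to a vec4). */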
      si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);

      struct pipe_constant_buffer cb = {};
      cb.buffer_size = sizeof(data);
      cb.user_buffer = data;
      ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
   }

   struct pipe_image_view image[2] = {0};
   image[0].resource = src;
   image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
   image[0].format = src_format;
   image[0].u.tex.level = src_level;
   image[0].u.tex.first_layer = 0;
   image[0].u.tex.last_layer = src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1
                                                              : (unsigned)(src->array_size - 1);
   image[1].resource = dst;
   image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
   image[1].format = dst_format;
   image[1].u.tex.level = dst_level;
   image[1].u.tex.first_layer = 0;
   image[1].u.tex.last_layer = dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1
                                                              : (unsigned)(dst->array_size - 1);

   /* SNORM8 blitting has precision issues on some chips. Use the SINT
    * equivalent instead, which doesn't force DCC decompression.
    * Note that some chips avoid this issue by using SDMA.
    */
   if (util_format_is_snorm8(dst->format)) {
      image[0].format = image[1].format = util_format_snorm8_to_sint8(dst->format);
   }

   if (is_dcc_decompress)
      image[1].access |= SI_IMAGE_ACCESS_DCC_OFF;

   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image);

   struct pipe_grid_info info = {0};

   if (is_dcc_decompress) {
      /* The DCC decompression is a normal blit where the load is compressed
       * and the store is uncompressed. The workgroup size is either equal to
       * the DCC block size or a multiple thereof. The shader uses a barrier
       * between loads and stores to safely overwrite each DCC block of pixels.
       */
      struct si_texture *tex = (struct si_texture *)src;
      unsigned dim[3] = {src_box->width, src_box->height, src_box->depth};

      assert(dst->target != PIPE_TEXTURE_1D && dst->target != PIPE_TEXTURE_1D_ARRAY);

      if (!sctx->cs_dcc_decompress)
         sctx->cs_dcc_decompress = si_create_dcc_decompress_cs(ctx);
      ctx->bind_compute_state(ctx, sctx->cs_dcc_decompress);

      info.block[0] = tex->surface.u.gfx9.dcc_block_width;
      info.block[1] = tex->surface.u.gfx9.dcc_block_height;
      info.block[2] = tex->surface.u.gfx9.dcc_block_depth;

      /* Make sure the block size is at least the same as wave size. */
      while (info.block[0] * info.block[1] * info.block[2] <
             sctx->screen->compute_wave_size) {
         info.block[0] *= 2;
      }
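      /* E.g. a 4x4x1 DCC block on a wave64 chip ends up as a 16x4x1 workgroup. */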
      for (unsigned i = 0; i < 3; i++) {
         info.last_block[i] = dim[i] % info.block[i];
         info.grid[i] = DIV_ROUND_UP(dim[i], info.block[i]);
      }
   } else if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) {
      if (!sctx->cs_copy_image_1d_array)
         sctx->cs_copy_image_1d_array = si_create_copy_image_compute_shader_1d_array(ctx);
      ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array);
      info.block[0] = 64;
      info.last_block[0] = width % 64;
      info.block[1] = 1;
      info.block[2] = 1;
      info.grid[0] = DIV_ROUND_UP(width, 64);
      info.grid[1] = depth;
      info.grid[2] = 1;
   } else {
      if (!sctx->cs_copy_image)
         sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx);
      ctx->bind_compute_state(ctx, sctx->cs_copy_image);

      /* This is better for access over PCIe. */
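      /* Linear layouts are walked with a 64x1 block so a wave touches one contiguous row;
       * tiled layouts presumably prefer a square 8x8 footprint. */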
      info.block[0] = is_linear ? 64 : 8;
      info.block[1] = is_linear ? 1 : 8;
      info.block[2] = 1;
      info.last_block[0] = width % info.block[0];
      info.last_block[1] = height % info.block[1];
      info.grid[0] = DIV_ROUND_UP(width, info.block[0]);
      info.grid[1] = DIV_ROUND_UP(height, info.block[1]);
      info.grid[2] = depth;
   }

   si_launch_grid_internal(sctx, &info, saved_cs,
                           SI_CS_WAIT_FOR_IDLE | SI_CS_IMAGE_OP);

   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
   for (int i = 0; i < 2; i++)
      pipe_resource_reference(&saved_image[i].resource, NULL);
   if (!is_dcc_decompress) {
      ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
      pipe_resource_reference(&saved_cb.buffer, NULL);
   }
}
void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
{
   struct pipe_context *ctx = &sctx->b;

   sctx->flags |= si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU);

   void *saved_cs = sctx->cs_shader_state.program;
   struct pipe_image_view saved_img[3] = {};

   for (unsigned i = 0; i < 3; i++) {
      util_copy_image_view(&saved_img[i], &sctx->images[PIPE_SHADER_COMPUTE].views[i]);
   }

   bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16;
   unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements;
   struct pipe_image_view img[3];

   assert(tex->surface.dcc_retile_map_offset && tex->surface.dcc_retile_map_offset <= UINT_MAX);
   assert(tex->surface.dcc_offset && tex->surface.dcc_offset <= UINT_MAX);
   assert(tex->surface.display_dcc_offset && tex->surface.display_dcc_offset <= UINT_MAX);

   for (unsigned i = 0; i < 3; i++) {
      img[i].resource = &tex->buffer.b.b;
      img[i].access = i == 2 ? PIPE_IMAGE_ACCESS_WRITE : PIPE_IMAGE_ACCESS_READ;
      img[i].shader_access = SI_IMAGE_ACCESS_AS_BUFFER;
   }

   img[0].format = use_uint16 ? PIPE_FORMAT_R16G16B16A16_UINT : PIPE_FORMAT_R32G32B32A32_UINT;
   img[0].u.buf.offset = tex->surface.dcc_retile_map_offset;
   img[0].u.buf.size = num_elements * (use_uint16 ? 2 : 4);

   img[1].format = PIPE_FORMAT_R8_UINT;
   img[1].u.buf.offset = tex->surface.dcc_offset;
   img[1].u.buf.size = tex->surface.dcc_size;

   img[2].format = PIPE_FORMAT_R8_UINT;
   img[2].u.buf.offset = tex->surface.display_dcc_offset;
   img[2].u.buf.size = tex->surface.u.gfx9.display_dcc_size;

   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, img);

   /* Bind the compute shader. */
   if (!sctx->cs_dcc_retile)
      sctx->cs_dcc_retile = si_create_dcc_retile_cs(ctx);
   ctx->bind_compute_state(ctx, sctx->cs_dcc_retile);

   /* Dispatch compute. */
   /* img[0] has 4 channels per element containing 2 pairs of DCC offsets. */
   unsigned num_threads = num_elements / 4;
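   /* num_elements counts scalar channels, so each thread fetches one RGBA texel of the
    * retile map and is expected to copy two DCC bytes (two src->dst offset pairs). */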
   struct pipe_grid_info info = {};
   info.block[0] = 64;
   info.block[1] = 1;
   info.block[2] = 1;
   info.grid[0] = DIV_ROUND_UP(num_threads, 64); /* includes the partial block */
   info.grid[1] = 1;
   info.grid[2] = 1;
   info.last_block[0] = num_threads % 64;

   si_launch_grid_internal(sctx, &info, saved_cs, 0);

   /* Don't flush caches or wait. The driver will wait at the end of this IB,
    * and L2 will be flushed by the kernel fence.
    */

   /* Restore states. */
   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, saved_img);

   for (unsigned i = 0; i < 3; i++) {
      pipe_resource_reference(&saved_img[i].resource, NULL);
   }
}
/* Expand FMASK to make it identity, so that image stores can ignore it. */
void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex)
{
   struct si_context *sctx = (struct si_context *)ctx;
   bool is_array = tex->target == PIPE_TEXTURE_2D_ARRAY;
   unsigned log_fragments = util_logbase2(tex->nr_storage_samples);
   unsigned log_samples = util_logbase2(tex->nr_samples);
   assert(tex->nr_samples >= 2);

   /* EQAA FMASK expansion is unimplemented. */
   if (tex->nr_samples != tex->nr_storage_samples)
      return;

   si_make_CB_shader_coherent(sctx, tex->nr_samples, true,
                              true /* DCC is not possible with image stores */);

   void *saved_cs = sctx->cs_shader_state.program;
   struct pipe_image_view saved_image = {0};
   util_copy_image_view(&saved_image, &sctx->images[PIPE_SHADER_COMPUTE].views[0]);

   /* Bind the image. */
   struct pipe_image_view image = {0};
   image.resource = tex;
   /* Don't set WRITE so as not to trigger FMASK expansion, causing
    * an infinite loop. */
   image.shader_access = image.access = PIPE_IMAGE_ACCESS_READ;
   image.format = util_format_linear(tex->format);
   if (is_array)
      image.u.tex.last_layer = tex->array_size - 1;

   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image);

   /* Bind the shader. */
   void **shader = &sctx->cs_fmask_expand[log_samples - 1][is_array];
   if (!*shader)
      *shader = si_create_fmask_expand_cs(ctx, tex->nr_samples, is_array);
   ctx->bind_compute_state(ctx, *shader);

   /* Dispatch compute. */
   struct pipe_grid_info info = {0};
   info.block[0] = 8;
   info.last_block[0] = tex->width0 % 8;
   info.block[1] = 8;
   info.last_block[1] = tex->height0 % 8;
   info.block[2] = 1;
   info.grid[0] = DIV_ROUND_UP(tex->width0, 8);
   info.grid[1] = DIV_ROUND_UP(tex->height0, 8);
   info.grid[2] = is_array ? tex->array_size : 1;

   si_launch_grid_internal(sctx, &info, saved_cs,
                           SI_CS_WAIT_FOR_IDLE | SI_CS_IMAGE_OP);

   /* Restore previous states. */
   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
   pipe_resource_reference(&saved_image.resource, NULL);

   /* Array of fully expanded FMASK values, arranged by [log2(fragments)][log2(samples)-1]. */
#define INVALID 0 /* never used */
   static const uint64_t fmask_expand_values[][4] = {
      /* samples */
      /* 2 (8 bpp) 4 (8 bpp)   8 (8-32bpp) 16 (16-64bpp)      fragments */
      {0x02020202, 0x0E0E0E0E, 0xFEFEFEFE, 0xFFFEFFFE},      /* 1 */
      {0x02020202, 0xA4A4A4A4, 0xAAA4AAA4, 0xAAAAAAA4},      /* 2 */
      {INVALID, 0xE4E4E4E4, 0x44443210, 0x4444444444443210}, /* 4 */
      {INVALID, INVALID, 0x76543210, 0x8888888876543210},    /* 8 */
   };
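   /* Each entry is the per-pixel FMASK value for an identity mapping, replicated to fill
    * 32 or 64 bits; e.g. 0xE4 = 0b11'10'01'00 points sample i at fragment i for the
    * 4-fragment / 4-sample case. */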
   /* Clear FMASK to identity. */
   struct si_texture *stex = (struct si_texture *)tex;
   si_clear_buffer(sctx, tex, stex->surface.fmask_offset, stex->surface.fmask_size,
                   (uint32_t *)&fmask_expand_values[log_fragments][log_samples - 1],
                   log_fragments >= 2 && log_samples == 4 ? 8 : 4, SI_COHERENCY_SHADER, false);
}
void si_init_compute_blit_functions(struct si_context *sctx)
{
   sctx->b.clear_buffer = si_pipe_clear_buffer;
}
/* Clear a region of a color surface to a constant value. */
void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dstsurf,
                                    const union pipe_color_union *color, unsigned dstx,
                                    unsigned dsty, unsigned width, unsigned height,
                                    bool render_condition_enabled)
{
   struct si_context *sctx = (struct si_context *)ctx;
   unsigned num_layers = dstsurf->u.tex.last_layer - dstsurf->u.tex.first_layer + 1;
   unsigned data[4 + sizeof(color->ui)] = {dstx, dsty, dstsurf->u.tex.first_layer, 0};
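   /* User constants passed to the clear shader: dwords 0-2 hold dstx, dsty and the first
    * layer, dword 3 is padding, and the clear color is copied into dwords 4-7 below. */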
   if (width == 0 || height == 0)
      return;

   /* The driver doesn't decompress resources automatically here. */
   si_decompress_subresource(ctx, dstsurf->texture, PIPE_MASK_RGBA, dstsurf->u.tex.level,
                             dstsurf->u.tex.first_layer, dstsurf->u.tex.last_layer);

   if (util_format_is_srgb(dstsurf->format)) {
      union pipe_color_union color_srgb;
      for (int i = 0; i < 3; i++)
         color_srgb.f[i] = util_format_linear_to_srgb_float(color->f[i]);
      color_srgb.f[3] = color->f[3];
      memcpy(data + 4, color_srgb.ui, sizeof(color->ui));
   } else {
      memcpy(data + 4, color->ui, sizeof(color->ui));
   }

   si_make_CB_shader_coherent(sctx, dstsurf->texture->nr_samples, true,
                              true /* DCC is not possible with image stores */);

   struct pipe_constant_buffer saved_cb = {};
   si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);

   struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
   struct pipe_image_view saved_image = {0};
   util_copy_image_view(&saved_image, &images->views[0]);

   void *saved_cs = sctx->cs_shader_state.program;

   struct pipe_constant_buffer cb = {};
   cb.buffer_size = sizeof(data);
   cb.user_buffer = data;
   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);

   struct pipe_image_view image = {0};
   image.resource = dstsurf->texture;
   image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE;
   image.format = util_format_linear(dstsurf->format);
   image.u.tex.level = dstsurf->u.tex.level;
   image.u.tex.first_layer = 0; /* 3D images ignore first_layer (BASE_ARRAY) */
   image.u.tex.last_layer = dstsurf->u.tex.last_layer;

   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image);

   struct pipe_grid_info info = {0};

   if (dstsurf->texture->target != PIPE_TEXTURE_1D_ARRAY) {
      if (!sctx->cs_clear_render_target)
         sctx->cs_clear_render_target = si_clear_render_target_shader(ctx);
      ctx->bind_compute_state(ctx, sctx->cs_clear_render_target);
      info.block[0] = 8;
      info.last_block[0] = width % 8;
      info.block[1] = 8;
      info.last_block[1] = height % 8;
      info.block[2] = 1;
      info.grid[0] = DIV_ROUND_UP(width, 8);
      info.grid[1] = DIV_ROUND_UP(height, 8);
      info.grid[2] = num_layers;
   } else {
      if (!sctx->cs_clear_render_target_1d_array)
         sctx->cs_clear_render_target_1d_array = si_clear_render_target_shader_1d_array(ctx);
      ctx->bind_compute_state(ctx, sctx->cs_clear_render_target_1d_array);
      info.block[0] = 64;
      info.last_block[0] = width % 64;
      info.block[1] = 1;
      info.block[2] = 1;
      info.grid[0] = DIV_ROUND_UP(width, 64);
      info.grid[1] = num_layers;
      info.grid[2] = 1;
   }

   si_launch_grid_internal(sctx, &info, saved_cs,
                           SI_CS_WAIT_FOR_IDLE | SI_CS_IMAGE_OP |
                              (render_condition_enabled ? SI_CS_RENDER_COND_ENABLE : 0));

   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
   pipe_resource_reference(&saved_image.resource, NULL);
   pipe_resource_reference(&saved_cb.buffer, NULL);
}