/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include "si_pipe.h"
#include "util/format/u_format.h"
#include "util/format_srgb.h"
30 /* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
31 * and L2_STREAM for src.
33 static enum si_cache_policy
get_cache_policy(struct si_context
*sctx
,
34 enum si_coherency coher
,
37 if ((sctx
->chip_class
>= GFX9
&& (coher
== SI_COHERENCY_CB_META
||
38 coher
== SI_COHERENCY_CP
)) ||
39 (sctx
->chip_class
>= GFX7
&& coher
== SI_COHERENCY_SHADER
))
40 return size
<= 256 * 1024 ? L2_LRU
: L2_STREAM
;
45 unsigned si_get_flush_flags(struct si_context
*sctx
, enum si_coherency coher
,
46 enum si_cache_policy cache_policy
)
50 case SI_COHERENCY_NONE
:
53 case SI_COHERENCY_SHADER
:
54 return SI_CONTEXT_INV_SCACHE
|
55 SI_CONTEXT_INV_VCACHE
|
56 (cache_policy
== L2_BYPASS
? SI_CONTEXT_INV_L2
: 0);
57 case SI_COHERENCY_CB_META
:
58 return SI_CONTEXT_FLUSH_AND_INV_CB
;
62 static void si_launch_grid_internal(struct si_context
*sctx
,
63 struct pipe_grid_info
*info
)
65 /* Set settings for driver-internal compute dispatches. */
66 sctx
->flags
&= ~SI_CONTEXT_START_PIPELINE_STATS
;
67 sctx
->flags
|= SI_CONTEXT_STOP_PIPELINE_STATS
;
68 sctx
->render_cond_force_off
= true;
69 /* Skip decompression to prevent infinite recursion. */
70 sctx
->blitter
->running
= true;
72 /* Dispatch compute. */
73 sctx
->b
.launch_grid(&sctx
->b
, info
);
75 /* Restore default settings. */
76 sctx
->flags
&= ~SI_CONTEXT_STOP_PIPELINE_STATS
;
77 sctx
->flags
|= SI_CONTEXT_START_PIPELINE_STATS
;
78 sctx
->render_cond_force_off
= false;
79 sctx
->blitter
->running
= false;
82 static void si_compute_clear_12bytes_buffer(struct si_context
*sctx
,
83 struct pipe_resource
*dst
,
86 const uint32_t *clear_value
,
87 enum si_coherency coher
)
89 struct pipe_context
*ctx
= &sctx
->b
;
91 assert(dst_offset
% 4 == 0);
92 assert(size
% 4 == 0);
93 unsigned size_12
= DIV_ROUND_UP(size
, 12);
95 unsigned data
[4] = {0};
96 memcpy(data
, clear_value
, 12);
98 sctx
->flags
|= SI_CONTEXT_PS_PARTIAL_FLUSH
|
99 SI_CONTEXT_CS_PARTIAL_FLUSH
|
100 si_get_flush_flags(sctx
, coher
, SI_COMPUTE_DST_CACHE_POLICY
);
102 struct pipe_shader_buffer saved_sb
= {0};
103 si_get_shader_buffers(sctx
, PIPE_SHADER_COMPUTE
, 0, 1, &saved_sb
);
105 unsigned saved_writable_mask
= 0;
106 if (sctx
->const_and_shader_buffers
[PIPE_SHADER_COMPUTE
].writable_mask
&
107 (1u << si_get_shaderbuf_slot(0)))
108 saved_writable_mask
= 1;
110 struct pipe_constant_buffer saved_cb
= {};
111 si_get_pipe_constant_buffer(sctx
, PIPE_SHADER_COMPUTE
, 0, &saved_cb
);
113 void *saved_cs
= sctx
->cs_shader_state
.program
;
115 struct pipe_constant_buffer cb
= {};
116 cb
.buffer_size
= sizeof(data
);
117 cb
.user_buffer
= data
;
118 ctx
->set_constant_buffer(ctx
, PIPE_SHADER_COMPUTE
, 0, &cb
);
120 struct pipe_shader_buffer sb
= {0};
122 sb
.buffer_offset
= dst_offset
;
123 sb
.buffer_size
= size
;
125 ctx
->set_shader_buffers(ctx
, PIPE_SHADER_COMPUTE
, 0, 1, &sb
, 0x1);
127 struct pipe_grid_info info
= {0};
129 if (!sctx
->cs_clear_12bytes_buffer
)
130 sctx
->cs_clear_12bytes_buffer
=
131 si_clear_12bytes_buffer_shader(ctx
);
132 ctx
->bind_compute_state(ctx
, sctx
->cs_clear_12bytes_buffer
);
134 info
.last_block
[0] = size_12
% 64;
137 info
.grid
[0] = DIV_ROUND_UP(size_12
, 64);
141 si_launch_grid_internal(sctx
, &info
);
143 ctx
->bind_compute_state(ctx
, saved_cs
);
144 ctx
->set_shader_buffers(ctx
, PIPE_SHADER_COMPUTE
, 0, 1, &saved_sb
, saved_writable_mask
);
145 ctx
->set_constant_buffer(ctx
, PIPE_SHADER_COMPUTE
, 0, &saved_cb
);
147 pipe_resource_reference(&saved_sb
.buffer
, NULL
);
148 pipe_resource_reference(&saved_cb
.buffer
, NULL
);
151 static void si_compute_do_clear_or_copy(struct si_context
*sctx
,
152 struct pipe_resource
*dst
,
154 struct pipe_resource
*src
,
157 const uint32_t *clear_value
,
158 unsigned clear_value_size
,
159 enum si_coherency coher
)
161 struct pipe_context
*ctx
= &sctx
->b
;
163 assert(src_offset
% 4 == 0);
164 assert(dst_offset
% 4 == 0);
165 assert(size
% 4 == 0);
167 assert(dst
->target
!= PIPE_BUFFER
|| dst_offset
+ size
<= dst
->width0
);
168 assert(!src
|| src_offset
+ size
<= src
->width0
);
170 sctx
->flags
|= SI_CONTEXT_PS_PARTIAL_FLUSH
|
171 SI_CONTEXT_CS_PARTIAL_FLUSH
|
172 si_get_flush_flags(sctx
, coher
, SI_COMPUTE_DST_CACHE_POLICY
);
175 void *saved_cs
= sctx
->cs_shader_state
.program
;
176 struct pipe_shader_buffer saved_sb
[2] = {};
177 si_get_shader_buffers(sctx
, PIPE_SHADER_COMPUTE
, 0, src
? 2 : 1, saved_sb
);
179 unsigned saved_writable_mask
= 0;
180 for (unsigned i
= 0; i
< (src
? 2 : 1); i
++) {
181 if (sctx
->const_and_shader_buffers
[PIPE_SHADER_COMPUTE
].writable_mask
&
182 (1u << si_get_shaderbuf_slot(i
)))
183 saved_writable_mask
|= 1 << i
;
186 /* The memory accesses are coalesced, meaning that the 1st instruction writes
187 * the 1st contiguous block of data for the whole wave, the 2nd instruction
188 * writes the 2nd contiguous block of data, etc.
190 unsigned dwords_per_thread
= src
? SI_COMPUTE_COPY_DW_PER_THREAD
:
191 SI_COMPUTE_CLEAR_DW_PER_THREAD
;
192 unsigned instructions_per_thread
= MAX2(1, dwords_per_thread
/ 4);
193 unsigned dwords_per_instruction
= dwords_per_thread
/ instructions_per_thread
;
194 unsigned wave_size
= sctx
->screen
->compute_wave_size
;
195 unsigned dwords_per_wave
= dwords_per_thread
* wave_size
;
197 unsigned num_dwords
= size
/ 4;
198 unsigned num_instructions
= DIV_ROUND_UP(num_dwords
, dwords_per_instruction
);
200 struct pipe_grid_info info
= {};
201 info
.block
[0] = MIN2(wave_size
, num_instructions
);
204 info
.grid
[0] = DIV_ROUND_UP(num_dwords
, dwords_per_wave
);
208 struct pipe_shader_buffer sb
[2] = {};
210 sb
[0].buffer_offset
= dst_offset
;
211 sb
[0].buffer_size
= size
;
213 bool shader_dst_stream_policy
= SI_COMPUTE_DST_CACHE_POLICY
!= L2_LRU
;
217 sb
[1].buffer_offset
= src_offset
;
218 sb
[1].buffer_size
= size
;
220 ctx
->set_shader_buffers(ctx
, PIPE_SHADER_COMPUTE
, 0, 2, sb
, 0x1);
222 if (!sctx
->cs_copy_buffer
) {
223 sctx
->cs_copy_buffer
= si_create_dma_compute_shader(&sctx
->b
,
224 SI_COMPUTE_COPY_DW_PER_THREAD
,
225 shader_dst_stream_policy
, true);
227 ctx
->bind_compute_state(ctx
, sctx
->cs_copy_buffer
);
229 assert(clear_value_size
>= 4 &&
230 clear_value_size
<= 16 &&
231 util_is_power_of_two_or_zero(clear_value_size
));
233 for (unsigned i
= 0; i
< 4; i
++)
234 sctx
->cs_user_data
[i
] = clear_value
[i
% (clear_value_size
/ 4)];
236 ctx
->set_shader_buffers(ctx
, PIPE_SHADER_COMPUTE
, 0, 1, sb
, 0x1);
238 if (!sctx
->cs_clear_buffer
) {
239 sctx
->cs_clear_buffer
= si_create_dma_compute_shader(&sctx
->b
,
240 SI_COMPUTE_CLEAR_DW_PER_THREAD
,
241 shader_dst_stream_policy
, false);
243 ctx
->bind_compute_state(ctx
, sctx
->cs_clear_buffer
);
246 si_launch_grid_internal(sctx
, &info
);
248 enum si_cache_policy cache_policy
= get_cache_policy(sctx
, coher
, size
);
249 sctx
->flags
|= SI_CONTEXT_CS_PARTIAL_FLUSH
|
250 (cache_policy
== L2_BYPASS
? SI_CONTEXT_WB_L2
: 0);
252 if (cache_policy
!= L2_BYPASS
)
253 si_resource(dst
)->TC_L2_dirty
= true;
255 /* Restore states. */
256 ctx
->bind_compute_state(ctx
, saved_cs
);
257 ctx
->set_shader_buffers(ctx
, PIPE_SHADER_COMPUTE
, 0, src
? 2 : 1, saved_sb
,
258 saved_writable_mask
);
259 for (int i
= 0; i
< 2; i
++)
260 pipe_resource_reference(&saved_sb
[i
].buffer
, NULL
);
263 void si_clear_buffer(struct si_context
*sctx
, struct pipe_resource
*dst
,
264 uint64_t offset
, uint64_t size
, uint32_t *clear_value
,
265 uint32_t clear_value_size
, enum si_coherency coher
,
271 ASSERTED
unsigned clear_alignment
= MIN2(clear_value_size
, 4);
273 assert(clear_value_size
!= 3 && clear_value_size
!= 6); /* 12 is allowed. */
274 assert(offset
% clear_alignment
== 0);
275 assert(size
% clear_alignment
== 0);
276 assert(size
< (UINT_MAX
& ~0xf)); /* TODO: test 64-bit sizes in all codepaths */
278 /* Reduce a large clear value size if possible. */
279 if (clear_value_size
> 4) {
280 bool clear_dword_duplicated
= true;
282 /* See if we can lower large fills to dword fills. */
283 for (unsigned i
= 1; i
< clear_value_size
/ 4; i
++) {
284 if (clear_value
[0] != clear_value
[i
]) {
285 clear_dword_duplicated
= false;
289 if (clear_dword_duplicated
)
290 clear_value_size
= 4;
293 /* Expand a small clear value size. */
294 uint32_t tmp_clear_value
;
295 if (clear_value_size
<= 2) {
296 if (clear_value_size
== 1) {
297 tmp_clear_value
= *(uint8_t*)clear_value
;
298 tmp_clear_value
|= (tmp_clear_value
<< 8) |
299 (tmp_clear_value
<< 16) |
300 (tmp_clear_value
<< 24);
302 tmp_clear_value
= *(uint16_t*)clear_value
;
303 tmp_clear_value
|= tmp_clear_value
<< 16;
305 clear_value
= &tmp_clear_value
;
306 clear_value_size
= 4;
309 if (clear_value_size
== 12) {
310 si_compute_clear_12bytes_buffer(sctx
, dst
, offset
, size
, clear_value
, coher
);
314 uint64_t aligned_size
= size
& ~3ull;
315 if (aligned_size
>= 4) {
316 /* Before GFX9, CP DMA was very slow when clearing GTT, so never
317 * use CP DMA clears on those chips, because we can't be certain
318 * about buffer placements.
320 if (clear_value_size
> 4 ||
322 clear_value_size
== 4 &&
324 (size
> 32*1024 || sctx
->chip_class
<= GFX9
))) {
325 si_compute_do_clear_or_copy(sctx
, dst
, offset
, NULL
, 0,
326 aligned_size
, clear_value
,
327 clear_value_size
, coher
);
329 assert(clear_value_size
== 4);
330 si_cp_dma_clear_buffer(sctx
, sctx
->gfx_cs
, dst
, offset
,
331 aligned_size
, *clear_value
, 0, coher
,
332 get_cache_policy(sctx
, coher
, size
));
335 offset
+= aligned_size
;
336 size
-= aligned_size
;
339 /* Handle non-dword alignment. */
342 assert(dst
->target
== PIPE_BUFFER
);
345 pipe_buffer_write(&sctx
->b
, dst
, offset
, size
, clear_value
);
349 static void si_pipe_clear_buffer(struct pipe_context
*ctx
,
350 struct pipe_resource
*dst
,
351 unsigned offset
, unsigned size
,
352 const void *clear_value
,
353 int clear_value_size
)
355 si_clear_buffer((struct si_context
*)ctx
, dst
, offset
, size
, (uint32_t*)clear_value
,
356 clear_value_size
, SI_COHERENCY_SHADER
, false);
359 void si_copy_buffer(struct si_context
*sctx
,
360 struct pipe_resource
*dst
, struct pipe_resource
*src
,
361 uint64_t dst_offset
, uint64_t src_offset
, unsigned size
)
366 enum si_coherency coher
= SI_COHERENCY_SHADER
;
367 enum si_cache_policy cache_policy
= get_cache_policy(sctx
, coher
, size
);
369 /* Only use compute for VRAM copies on dGPUs. */
370 if (sctx
->screen
->info
.has_dedicated_vram
&&
371 si_resource(dst
)->domains
& RADEON_DOMAIN_VRAM
&&
372 si_resource(src
)->domains
& RADEON_DOMAIN_VRAM
&&
374 dst_offset
% 4 == 0 && src_offset
% 4 == 0 && size
% 4 == 0) {
375 si_compute_do_clear_or_copy(sctx
, dst
, dst_offset
, src
, src_offset
,
376 size
, NULL
, 0, coher
);
378 si_cp_dma_copy_buffer(sctx
, dst
, src
, dst_offset
, src_offset
, size
,
379 0, coher
, cache_policy
);
383 void si_compute_copy_image(struct si_context
*sctx
,
384 struct pipe_resource
*dst
,
386 struct pipe_resource
*src
,
388 unsigned dstx
, unsigned dsty
, unsigned dstz
,
389 const struct pipe_box
*src_box
)
391 struct pipe_context
*ctx
= &sctx
->b
;
392 unsigned width
= src_box
->width
;
393 unsigned height
= src_box
->height
;
394 unsigned depth
= src_box
->depth
;
395 enum pipe_format src_format
= util_format_linear(src
->format
);
396 enum pipe_format dst_format
= util_format_linear(dst
->format
);
398 assert(util_format_is_subsampled_422(src_format
) ==
399 util_format_is_subsampled_422(dst_format
));
401 if (util_format_is_subsampled_422(src_format
)) {
402 src_format
= dst_format
= PIPE_FORMAT_R32_UINT
;
403 /* Interpreting 422 subsampled format (16 bpp) as 32 bpp
404 * should force us to divide src_box->x, dstx and width by 2.
405 * But given that ac_surface allocates this format as 32 bpp
406 * and that surf_size is then modified to pack the values
407 * we must keep the original values to get the correct results.
410 unsigned data
[] = {src_box
->x
, src_box
->y
, src_box
->z
, 0,
411 dstx
, dsty
, dstz
, 0};
413 if (width
== 0 || height
== 0)
416 sctx
->flags
|= SI_CONTEXT_CS_PARTIAL_FLUSH
|
417 si_get_flush_flags(sctx
, SI_COHERENCY_SHADER
, L2_STREAM
);
419 /* The driver doesn't decompress resources automatically here. */
420 si_decompress_subresource(ctx
, dst
, PIPE_MASK_RGBAZS
, dst_level
,
421 dstz
, dstz
+ src_box
->depth
- 1);
422 si_decompress_subresource(ctx
, src
, PIPE_MASK_RGBAZS
, src_level
,
423 src_box
->z
, src_box
->z
+ src_box
->depth
- 1);
425 /* src and dst have the same number of samples. */
426 si_make_CB_shader_coherent(sctx
, src
->nr_samples
, true,
427 /* Only src can have DCC.*/
428 ((struct si_texture
*)src
)->surface
.u
.gfx9
.dcc
.pipe_aligned
);
430 struct pipe_constant_buffer saved_cb
= {};
431 si_get_pipe_constant_buffer(sctx
, PIPE_SHADER_COMPUTE
, 0, &saved_cb
);
433 struct si_images
*images
= &sctx
->images
[PIPE_SHADER_COMPUTE
];
434 struct pipe_image_view saved_image
[2] = {0};
435 util_copy_image_view(&saved_image
[0], &images
->views
[0]);
436 util_copy_image_view(&saved_image
[1], &images
->views
[1]);
438 void *saved_cs
= sctx
->cs_shader_state
.program
;
440 struct pipe_constant_buffer cb
= {};
441 cb
.buffer_size
= sizeof(data
);
442 cb
.user_buffer
= data
;
443 ctx
->set_constant_buffer(ctx
, PIPE_SHADER_COMPUTE
, 0, &cb
);
445 struct pipe_image_view image
[2] = {0};
446 image
[0].resource
= src
;
447 image
[0].shader_access
= image
[0].access
= PIPE_IMAGE_ACCESS_READ
;
448 image
[0].format
= src_format
;
449 image
[0].u
.tex
.level
= src_level
;
450 image
[0].u
.tex
.first_layer
= 0;
451 image
[0].u
.tex
.last_layer
=
452 src
->target
== PIPE_TEXTURE_3D
? u_minify(src
->depth0
, src_level
) - 1
453 : (unsigned)(src
->array_size
- 1);
454 image
[1].resource
= dst
;
455 image
[1].shader_access
= image
[1].access
= PIPE_IMAGE_ACCESS_WRITE
;
456 image
[1].format
= dst_format
;
457 image
[1].u
.tex
.level
= dst_level
;
458 image
[1].u
.tex
.first_layer
= 0;
459 image
[1].u
.tex
.last_layer
=
460 dst
->target
== PIPE_TEXTURE_3D
? u_minify(dst
->depth0
, dst_level
) - 1
461 : (unsigned)(dst
->array_size
- 1);
463 if (src
->format
== PIPE_FORMAT_R9G9B9E5_FLOAT
)
464 image
[0].format
= image
[1].format
= PIPE_FORMAT_R32_UINT
;
466 /* SNORM8 blitting has precision issues on some chips. Use the SINT
467 * equivalent instead, which doesn't force DCC decompression.
468 * Note that some chips avoid this issue by using SDMA.
470 if (util_format_is_snorm8(dst
->format
)) {
471 image
[0].format
= image
[1].format
=
472 util_format_snorm8_to_sint8(dst
->format
);
475 ctx
->set_shader_images(ctx
, PIPE_SHADER_COMPUTE
, 0, 2, image
);
477 struct pipe_grid_info info
= {0};
479 if (dst
->target
== PIPE_TEXTURE_1D_ARRAY
&& src
->target
== PIPE_TEXTURE_1D_ARRAY
) {
480 if (!sctx
->cs_copy_image_1d_array
)
481 sctx
->cs_copy_image_1d_array
=
482 si_create_copy_image_compute_shader_1d_array(ctx
);
483 ctx
->bind_compute_state(ctx
, sctx
->cs_copy_image_1d_array
);
485 info
.last_block
[0] = width
% 64;
488 info
.grid
[0] = DIV_ROUND_UP(width
, 64);
489 info
.grid
[1] = depth
;
492 if (!sctx
->cs_copy_image
)
493 sctx
->cs_copy_image
= si_create_copy_image_compute_shader(ctx
);
494 ctx
->bind_compute_state(ctx
, sctx
->cs_copy_image
);
496 info
.last_block
[0] = width
% 8;
498 info
.last_block
[1] = height
% 8;
500 info
.grid
[0] = DIV_ROUND_UP(width
, 8);
501 info
.grid
[1] = DIV_ROUND_UP(height
, 8);
502 info
.grid
[2] = depth
;
505 si_launch_grid_internal(sctx
, &info
);
507 sctx
->flags
|= SI_CONTEXT_CS_PARTIAL_FLUSH
|
508 (sctx
->chip_class
<= GFX8
? SI_CONTEXT_WB_L2
: 0) |
509 si_get_flush_flags(sctx
, SI_COHERENCY_SHADER
, L2_STREAM
);
510 ctx
->bind_compute_state(ctx
, saved_cs
);
511 ctx
->set_shader_images(ctx
, PIPE_SHADER_COMPUTE
, 0, 2, saved_image
);
512 ctx
->set_constant_buffer(ctx
, PIPE_SHADER_COMPUTE
, 0, &saved_cb
);
513 for (int i
= 0; i
< 2; i
++)
514 pipe_resource_reference(&saved_image
[i
].resource
, NULL
);
515 pipe_resource_reference(&saved_cb
.buffer
, NULL
);
518 void si_retile_dcc(struct si_context
*sctx
, struct si_texture
*tex
)
520 struct pipe_context
*ctx
= &sctx
->b
;
522 sctx
->flags
|= SI_CONTEXT_PS_PARTIAL_FLUSH
|
523 SI_CONTEXT_CS_PARTIAL_FLUSH
|
524 si_get_flush_flags(sctx
, SI_COHERENCY_CB_META
, L2_LRU
) |
525 si_get_flush_flags(sctx
, SI_COHERENCY_SHADER
, L2_LRU
);
526 sctx
->emit_cache_flush(sctx
);
529 void *saved_cs
= sctx
->cs_shader_state
.program
;
530 struct pipe_image_view saved_img
[3] = {};
532 for (unsigned i
= 0; i
< 3; i
++) {
533 util_copy_image_view(&saved_img
[i
],
534 &sctx
->images
[PIPE_SHADER_COMPUTE
].views
[i
]);
538 bool use_uint16
= tex
->surface
.u
.gfx9
.dcc_retile_use_uint16
;
539 unsigned num_elements
= tex
->surface
.u
.gfx9
.dcc_retile_num_elements
;
540 struct pipe_image_view img
[3];
542 assert(tex
->surface
.dcc_retile_map_offset
&& tex
->surface
.dcc_retile_map_offset
<= UINT_MAX
);
543 assert(tex
->surface
.dcc_offset
&& tex
->surface
.dcc_offset
<= UINT_MAX
);
544 assert(tex
->surface
.display_dcc_offset
&& tex
->surface
.display_dcc_offset
<= UINT_MAX
);
546 for (unsigned i
= 0; i
< 3; i
++) {
547 img
[i
].resource
= &tex
->buffer
.b
.b
;
548 img
[i
].access
= i
== 2 ? PIPE_IMAGE_ACCESS_WRITE
: PIPE_IMAGE_ACCESS_READ
;
549 img
[i
].shader_access
= SI_IMAGE_ACCESS_AS_BUFFER
;
552 img
[0].format
= use_uint16
? PIPE_FORMAT_R16G16B16A16_UINT
:
553 PIPE_FORMAT_R32G32B32A32_UINT
;
554 img
[0].u
.buf
.offset
= tex
->surface
.dcc_retile_map_offset
;
555 img
[0].u
.buf
.size
= num_elements
* (use_uint16
? 2 : 4);
557 img
[1].format
= PIPE_FORMAT_R8_UINT
;
558 img
[1].u
.buf
.offset
= tex
->surface
.dcc_offset
;
559 img
[1].u
.buf
.size
= tex
->surface
.dcc_size
;
561 img
[2].format
= PIPE_FORMAT_R8_UINT
;
562 img
[2].u
.buf
.offset
= tex
->surface
.display_dcc_offset
;
563 img
[2].u
.buf
.size
= tex
->surface
.u
.gfx9
.display_dcc_size
;
565 ctx
->set_shader_images(ctx
, PIPE_SHADER_COMPUTE
, 0, 3, img
);
567 /* Bind the compute shader. */
568 if (!sctx
->cs_dcc_retile
)
569 sctx
->cs_dcc_retile
= si_create_dcc_retile_cs(ctx
);
570 ctx
->bind_compute_state(ctx
, sctx
->cs_dcc_retile
);
572 /* Dispatch compute. */
573 /* img[0] has 4 channels per element containing 2 pairs of DCC offsets. */
574 unsigned num_threads
= num_elements
/ 4;
576 struct pipe_grid_info info
= {};
580 info
.grid
[0] = DIV_ROUND_UP(num_threads
, 64); /* includes the partial block */
583 info
.last_block
[0] = num_threads
% 64;
585 si_launch_grid_internal(sctx
, &info
);
587 /* Don't flush caches or wait. The driver will wait at the end of this IB,
588 * and L2 will be flushed by the kernel fence.
591 /* Restore states. */
592 ctx
->bind_compute_state(ctx
, saved_cs
);
593 ctx
->set_shader_images(ctx
, PIPE_SHADER_COMPUTE
, 0, 3, saved_img
);
595 for (unsigned i
= 0; i
< 3; i
++) {
596 pipe_resource_reference(&saved_img
[i
].resource
, NULL
);
600 /* Expand FMASK to make it identity, so that image stores can ignore it. */
601 void si_compute_expand_fmask(struct pipe_context
*ctx
, struct pipe_resource
*tex
)
603 struct si_context
*sctx
= (struct si_context
*)ctx
;
604 bool is_array
= tex
->target
== PIPE_TEXTURE_2D_ARRAY
;
605 unsigned log_fragments
= util_logbase2(tex
->nr_storage_samples
);
606 unsigned log_samples
= util_logbase2(tex
->nr_samples
);
607 assert(tex
->nr_samples
>= 2);
609 /* EQAA FMASK expansion is unimplemented. */
610 if (tex
->nr_samples
!= tex
->nr_storage_samples
)
613 /* Flush caches and sync engines. */
614 sctx
->flags
|= SI_CONTEXT_CS_PARTIAL_FLUSH
|
615 si_get_flush_flags(sctx
, SI_COHERENCY_SHADER
, L2_STREAM
);
616 si_make_CB_shader_coherent(sctx
, tex
->nr_samples
, true,
617 true /* DCC is not possible with image stores */);
620 void *saved_cs
= sctx
->cs_shader_state
.program
;
621 struct pipe_image_view saved_image
= {0};
622 util_copy_image_view(&saved_image
, &sctx
->images
[PIPE_SHADER_COMPUTE
].views
[0]);
624 /* Bind the image. */
625 struct pipe_image_view image
= {0};
626 image
.resource
= tex
;
627 /* Don't set WRITE so as not to trigger FMASK expansion, causing
628 * an infinite loop. */
629 image
.shader_access
= image
.access
= PIPE_IMAGE_ACCESS_READ
;
630 image
.format
= util_format_linear(tex
->format
);
632 image
.u
.tex
.last_layer
= tex
->array_size
- 1;
634 ctx
->set_shader_images(ctx
, PIPE_SHADER_COMPUTE
, 0, 1, &image
);
636 /* Bind the shader. */
637 void **shader
= &sctx
->cs_fmask_expand
[log_samples
- 1][is_array
];
639 *shader
= si_create_fmask_expand_cs(ctx
, tex
->nr_samples
, is_array
);
640 ctx
->bind_compute_state(ctx
, *shader
);
642 /* Dispatch compute. */
643 struct pipe_grid_info info
= {0};
645 info
.last_block
[0] = tex
->width0
% 8;
647 info
.last_block
[1] = tex
->height0
% 8;
649 info
.grid
[0] = DIV_ROUND_UP(tex
->width0
, 8);
650 info
.grid
[1] = DIV_ROUND_UP(tex
->height0
, 8);
651 info
.grid
[2] = is_array
? tex
->array_size
: 1;
653 si_launch_grid_internal(sctx
, &info
);
655 /* Flush caches and sync engines. */
656 sctx
->flags
|= SI_CONTEXT_CS_PARTIAL_FLUSH
|
657 (sctx
->chip_class
<= GFX8
? SI_CONTEXT_WB_L2
: 0) |
658 si_get_flush_flags(sctx
, SI_COHERENCY_SHADER
, L2_STREAM
);
660 /* Restore previous states. */
661 ctx
->bind_compute_state(ctx
, saved_cs
);
662 ctx
->set_shader_images(ctx
, PIPE_SHADER_COMPUTE
, 0, 1, &saved_image
);
663 pipe_resource_reference(&saved_image
.resource
, NULL
);
665 /* Array of fully expanded FMASK values, arranged by [log2(fragments)][log2(samples)-1]. */
666 #define INVALID 0 /* never used */
667 static const uint64_t fmask_expand_values
[][4] = {
669 /* 2 (8 bpp) 4 (8 bpp) 8 (8-32bpp) 16 (16-64bpp) fragments */
670 {0x02020202, 0x0E0E0E0E, 0xFEFEFEFE, 0xFFFEFFFE}, /* 1 */
671 {0x02020202, 0xA4A4A4A4, 0xAAA4AAA4, 0xAAAAAAA4}, /* 2 */
672 {INVALID
, 0xE4E4E4E4, 0x44443210, 0x4444444444443210}, /* 4 */
673 {INVALID
, INVALID
, 0x76543210, 0x8888888876543210}, /* 8 */
676 /* Clear FMASK to identity. */
677 struct si_texture
*stex
= (struct si_texture
*)tex
;
678 si_clear_buffer(sctx
, tex
, stex
->surface
.fmask_offset
, stex
->surface
.fmask_size
,
679 (uint32_t*)&fmask_expand_values
[log_fragments
][log_samples
- 1],
680 4, SI_COHERENCY_SHADER
, false);
683 void si_init_compute_blit_functions(struct si_context
*sctx
)
685 sctx
->b
.clear_buffer
= si_pipe_clear_buffer
;
688 /* Clear a region of a color surface to a constant value. */
689 void si_compute_clear_render_target(struct pipe_context
*ctx
,
690 struct pipe_surface
*dstsurf
,
691 const union pipe_color_union
*color
,
692 unsigned dstx
, unsigned dsty
,
693 unsigned width
, unsigned height
,
694 bool render_condition_enabled
)
696 struct si_context
*sctx
= (struct si_context
*)ctx
;
697 unsigned num_layers
= dstsurf
->u
.tex
.last_layer
- dstsurf
->u
.tex
.first_layer
+ 1;
698 unsigned data
[4 + sizeof(color
->ui
)] = {dstx
, dsty
, dstsurf
->u
.tex
.first_layer
, 0};
700 if (width
== 0 || height
== 0)
703 /* The driver doesn't decompress resources automatically here. */
704 si_decompress_subresource(ctx
, dstsurf
->texture
, PIPE_MASK_RGBA
,
705 dstsurf
->u
.tex
.level
, dstsurf
->u
.tex
.first_layer
,
706 dstsurf
->u
.tex
.last_layer
);
708 if (util_format_is_srgb(dstsurf
->format
)) {
709 union pipe_color_union color_srgb
;
710 for (int i
= 0; i
< 3; i
++)
711 color_srgb
.f
[i
] = util_format_linear_to_srgb_float(color
->f
[i
]);
712 color_srgb
.f
[3] = color
->f
[3];
713 memcpy(data
+ 4, color_srgb
.ui
, sizeof(color
->ui
));
715 memcpy(data
+ 4, color
->ui
, sizeof(color
->ui
));
718 sctx
->render_cond_force_off
= !render_condition_enabled
;
720 sctx
->flags
|= SI_CONTEXT_CS_PARTIAL_FLUSH
|
721 si_get_flush_flags(sctx
, SI_COHERENCY_SHADER
, L2_STREAM
);
722 si_make_CB_shader_coherent(sctx
, dstsurf
->texture
->nr_samples
, true,
723 true /* DCC is not possible with image stores */);
725 struct pipe_constant_buffer saved_cb
= {};
726 si_get_pipe_constant_buffer(sctx
, PIPE_SHADER_COMPUTE
, 0, &saved_cb
);
728 struct si_images
*images
= &sctx
->images
[PIPE_SHADER_COMPUTE
];
729 struct pipe_image_view saved_image
= {0};
730 util_copy_image_view(&saved_image
, &images
->views
[0]);
732 void *saved_cs
= sctx
->cs_shader_state
.program
;
734 struct pipe_constant_buffer cb
= {};
735 cb
.buffer_size
= sizeof(data
);
736 cb
.user_buffer
= data
;
737 ctx
->set_constant_buffer(ctx
, PIPE_SHADER_COMPUTE
, 0, &cb
);
739 struct pipe_image_view image
= {0};
740 image
.resource
= dstsurf
->texture
;
741 image
.shader_access
= image
.access
= PIPE_IMAGE_ACCESS_WRITE
;
742 image
.format
= util_format_linear(dstsurf
->format
);
743 image
.u
.tex
.level
= dstsurf
->u
.tex
.level
;
744 image
.u
.tex
.first_layer
= 0; /* 3D images ignore first_layer (BASE_ARRAY) */
745 image
.u
.tex
.last_layer
= dstsurf
->u
.tex
.last_layer
;
747 ctx
->set_shader_images(ctx
, PIPE_SHADER_COMPUTE
, 0, 1, &image
);
749 struct pipe_grid_info info
= {0};
751 if (dstsurf
->texture
->target
!= PIPE_TEXTURE_1D_ARRAY
) {
752 if (!sctx
->cs_clear_render_target
)
753 sctx
->cs_clear_render_target
= si_clear_render_target_shader(ctx
);
754 ctx
->bind_compute_state(ctx
, sctx
->cs_clear_render_target
);
756 info
.last_block
[0] = width
% 8;
758 info
.last_block
[1] = height
% 8;
760 info
.grid
[0] = DIV_ROUND_UP(width
, 8);
761 info
.grid
[1] = DIV_ROUND_UP(height
, 8);
762 info
.grid
[2] = num_layers
;
764 if (!sctx
->cs_clear_render_target_1d_array
)
765 sctx
->cs_clear_render_target_1d_array
=
766 si_clear_render_target_shader_1d_array(ctx
);
767 ctx
->bind_compute_state(ctx
, sctx
->cs_clear_render_target_1d_array
);
769 info
.last_block
[0] = width
% 64;
772 info
.grid
[0] = DIV_ROUND_UP(width
, 64);
773 info
.grid
[1] = num_layers
;
777 si_launch_grid_internal(sctx
, &info
);
779 sctx
->flags
|= SI_CONTEXT_CS_PARTIAL_FLUSH
|
780 (sctx
->chip_class
<= GFX8
? SI_CONTEXT_WB_L2
: 0) |
781 si_get_flush_flags(sctx
, SI_COHERENCY_SHADER
, L2_STREAM
);
782 ctx
->bind_compute_state(ctx
, saved_cs
);
783 ctx
->set_shader_images(ctx
, PIPE_SHADER_COMPUTE
, 0, 1, &saved_image
);
784 ctx
->set_constant_buffer(ctx
, PIPE_SHADER_COMPUTE
, 0, &saved_cb
);
785 pipe_resource_reference(&saved_image
.resource
, NULL
);
786 pipe_resource_reference(&saved_cb
.buffer
, NULL
);