/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */

/* NOTE(review): the driver-private header that declares struct si_context and
 * the si_* helpers used below is not visible in this include block - confirm
 * it is included ahead of these. */
#include "util/u_format.h"
#include "util/format_srgb.h"

30 /* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
31 * and L2_STREAM for src.
33 static enum si_cache_policy
get_cache_policy(struct si_context
*sctx
,
34 enum si_coherency coher
,
37 if ((sctx
->chip_class
>= GFX9
&& (coher
== SI_COHERENCY_CB_META
||
38 coher
== SI_COHERENCY_CP
)) ||
39 (sctx
->chip_class
>= CIK
&& coher
== SI_COHERENCY_SHADER
))
40 return size
<= 256 * 1024 ? L2_LRU
: L2_STREAM
;
45 unsigned si_get_flush_flags(struct si_context
*sctx
, enum si_coherency coher
,
46 enum si_cache_policy cache_policy
)
50 case SI_COHERENCY_NONE
:
53 case SI_COHERENCY_SHADER
:
54 return SI_CONTEXT_INV_SMEM_L1
|
55 SI_CONTEXT_INV_VMEM_L1
|
56 (cache_policy
== L2_BYPASS
? SI_CONTEXT_INV_GLOBAL_L2
: 0);
57 case SI_COHERENCY_CB_META
:
58 return SI_CONTEXT_FLUSH_AND_INV_CB
;
62 static void si_compute_internal_begin(struct si_context
*sctx
)
64 sctx
->flags
&= ~SI_CONTEXT_START_PIPELINE_STATS
;
65 sctx
->flags
|= SI_CONTEXT_STOP_PIPELINE_STATS
;
66 sctx
->render_cond_force_off
= true;
69 static void si_compute_internal_end(struct si_context
*sctx
)
71 sctx
->flags
&= ~SI_CONTEXT_STOP_PIPELINE_STATS
;
72 sctx
->flags
|= SI_CONTEXT_START_PIPELINE_STATS
;
73 sctx
->render_cond_force_off
= false;
76 static void si_compute_do_clear_or_copy(struct si_context
*sctx
,
77 struct pipe_resource
*dst
,
79 struct pipe_resource
*src
,
82 const uint32_t *clear_value
,
83 unsigned clear_value_size
,
84 enum si_coherency coher
)
86 struct pipe_context
*ctx
= &sctx
->b
;
88 assert(src_offset
% 4 == 0);
89 assert(dst_offset
% 4 == 0);
90 assert(size
% 4 == 0);
92 assert(dst
->target
!= PIPE_BUFFER
|| dst_offset
+ size
<= dst
->width0
);
93 assert(!src
|| src_offset
+ size
<= src
->width0
);
95 si_compute_internal_begin(sctx
);
96 sctx
->flags
|= SI_CONTEXT_PS_PARTIAL_FLUSH
|
97 SI_CONTEXT_CS_PARTIAL_FLUSH
|
98 si_get_flush_flags(sctx
, coher
, SI_COMPUTE_DST_CACHE_POLICY
);
101 void *saved_cs
= sctx
->cs_shader_state
.program
;
102 struct pipe_shader_buffer saved_sb
[2] = {};
103 si_get_shader_buffers(sctx
, PIPE_SHADER_COMPUTE
, 0, src
? 2 : 1, saved_sb
);
105 unsigned saved_writable_mask
= 0;
106 for (unsigned i
= 0; i
< (src
? 2 : 1); i
++) {
107 if (sctx
->const_and_shader_buffers
[PIPE_SHADER_COMPUTE
].writable_mask
&
108 (1u << si_get_shaderbuf_slot(i
)))
109 saved_writable_mask
|= 1 << i
;
112 /* The memory accesses are coalesced, meaning that the 1st instruction writes
113 * the 1st contiguous block of data for the whole wave, the 2nd instruction
114 * writes the 2nd contiguous block of data, etc.
116 unsigned dwords_per_thread
= src
? SI_COMPUTE_COPY_DW_PER_THREAD
:
117 SI_COMPUTE_CLEAR_DW_PER_THREAD
;
118 unsigned instructions_per_thread
= MAX2(1, dwords_per_thread
/ 4);
119 unsigned dwords_per_instruction
= dwords_per_thread
/ instructions_per_thread
;
120 unsigned dwords_per_wave
= dwords_per_thread
* 64;
122 unsigned num_dwords
= size
/ 4;
123 unsigned num_instructions
= DIV_ROUND_UP(num_dwords
, dwords_per_instruction
);
125 struct pipe_grid_info info
= {};
126 info
.block
[0] = MIN2(64, num_instructions
);
129 info
.grid
[0] = DIV_ROUND_UP(num_dwords
, dwords_per_wave
);
133 struct pipe_shader_buffer sb
[2] = {};
135 sb
[0].buffer_offset
= dst_offset
;
136 sb
[0].buffer_size
= size
;
138 bool shader_dst_stream_policy
= SI_COMPUTE_DST_CACHE_POLICY
!= L2_LRU
;
142 sb
[1].buffer_offset
= src_offset
;
143 sb
[1].buffer_size
= size
;
145 ctx
->set_shader_buffers(ctx
, PIPE_SHADER_COMPUTE
, 0, 2, sb
, 0x1);
147 if (!sctx
->cs_copy_buffer
) {
148 sctx
->cs_copy_buffer
= si_create_dma_compute_shader(&sctx
->b
,
149 SI_COMPUTE_COPY_DW_PER_THREAD
,
150 shader_dst_stream_policy
, true);
152 ctx
->bind_compute_state(ctx
, sctx
->cs_copy_buffer
);
154 assert(clear_value_size
>= 4 &&
155 clear_value_size
<= 16 &&
156 util_is_power_of_two_or_zero(clear_value_size
));
158 for (unsigned i
= 0; i
< 4; i
++)
159 sctx
->cs_user_data
[i
] = clear_value
[i
% (clear_value_size
/ 4)];
161 ctx
->set_shader_buffers(ctx
, PIPE_SHADER_COMPUTE
, 0, 1, sb
, 0x1);
163 if (!sctx
->cs_clear_buffer
) {
164 sctx
->cs_clear_buffer
= si_create_dma_compute_shader(&sctx
->b
,
165 SI_COMPUTE_CLEAR_DW_PER_THREAD
,
166 shader_dst_stream_policy
, false);
168 ctx
->bind_compute_state(ctx
, sctx
->cs_clear_buffer
);
171 ctx
->launch_grid(ctx
, &info
);
173 enum si_cache_policy cache_policy
= get_cache_policy(sctx
, coher
, size
);
174 sctx
->flags
|= SI_CONTEXT_CS_PARTIAL_FLUSH
|
175 (cache_policy
== L2_BYPASS
? SI_CONTEXT_WRITEBACK_GLOBAL_L2
: 0);
177 if (cache_policy
!= L2_BYPASS
)
178 si_resource(dst
)->TC_L2_dirty
= true;
180 /* Restore states. */
181 ctx
->bind_compute_state(ctx
, saved_cs
);
182 ctx
->set_shader_buffers(ctx
, PIPE_SHADER_COMPUTE
, 0, src
? 2 : 1, saved_sb
,
183 saved_writable_mask
);
184 si_compute_internal_end(sctx
);
187 void si_clear_buffer(struct si_context
*sctx
, struct pipe_resource
*dst
,
188 uint64_t offset
, uint64_t size
, uint32_t *clear_value
,
189 uint32_t clear_value_size
, enum si_coherency coher
)
194 unsigned clear_alignment
= MIN2(clear_value_size
, 4);
196 assert(clear_value_size
!= 3 && clear_value_size
!= 6); /* 12 is allowed. */
197 assert(offset
% clear_alignment
== 0);
198 assert(size
% clear_alignment
== 0);
199 assert(size
< (UINT_MAX
& ~0xf)); /* TODO: test 64-bit sizes in all codepaths */
201 /* Reduce a large clear value size if possible. */
202 if (clear_value_size
> 4) {
203 bool clear_dword_duplicated
= true;
205 /* See if we can lower large fills to dword fills. */
206 for (unsigned i
= 1; i
< clear_value_size
/ 4; i
++) {
207 if (clear_value
[0] != clear_value
[i
]) {
208 clear_dword_duplicated
= false;
212 if (clear_dword_duplicated
)
213 clear_value_size
= 4;
216 /* Expand a small clear value size. */
217 uint32_t tmp_clear_value
;
218 if (clear_value_size
<= 2) {
219 if (clear_value_size
== 1) {
220 tmp_clear_value
= *(uint8_t*)clear_value
;
221 tmp_clear_value
|= (tmp_clear_value
<< 8) |
222 (tmp_clear_value
<< 16) |
223 (tmp_clear_value
<< 24);
225 tmp_clear_value
= *(uint16_t*)clear_value
;
226 tmp_clear_value
|= tmp_clear_value
<< 16;
228 clear_value
= &tmp_clear_value
;
229 clear_value_size
= 4;
232 /* Use transform feedback for 12-byte clears. */
233 /* TODO: Use compute. */
234 if (clear_value_size
== 12) {
235 union pipe_color_union streamout_clear_value
;
237 memcpy(&streamout_clear_value
, clear_value
, clear_value_size
);
238 si_blitter_begin(sctx
, SI_DISABLE_RENDER_COND
);
239 util_blitter_clear_buffer(sctx
->blitter
, dst
, offset
,
240 size
, clear_value_size
/ 4,
241 &streamout_clear_value
);
242 si_blitter_end(sctx
);
246 uint64_t aligned_size
= size
& ~3ull;
247 if (aligned_size
>= 4) {
248 /* Before GFX9, CP DMA was very slow when clearing GTT, so never
249 * use CP DMA clears on those chips, because we can't be certain
250 * about buffer placements.
252 if (clear_value_size
> 4 ||
253 (clear_value_size
== 4 &&
255 (size
> 32*1024 || sctx
->chip_class
<= VI
))) {
256 si_compute_do_clear_or_copy(sctx
, dst
, offset
, NULL
, 0,
257 aligned_size
, clear_value
,
258 clear_value_size
, coher
);
260 assert(clear_value_size
== 4);
261 si_cp_dma_clear_buffer(sctx
, sctx
->gfx_cs
, dst
, offset
,
262 aligned_size
, *clear_value
, 0, coher
,
263 get_cache_policy(sctx
, coher
, size
));
266 offset
+= aligned_size
;
267 size
-= aligned_size
;
270 /* Handle non-dword alignment. */
273 assert(dst
->target
== PIPE_BUFFER
);
276 pipe_buffer_write(&sctx
->b
, dst
, offset
, size
, clear_value
);
280 static void si_pipe_clear_buffer(struct pipe_context
*ctx
,
281 struct pipe_resource
*dst
,
282 unsigned offset
, unsigned size
,
283 const void *clear_value
,
284 int clear_value_size
)
286 si_clear_buffer((struct si_context
*)ctx
, dst
, offset
, size
, (uint32_t*)clear_value
,
287 clear_value_size
, SI_COHERENCY_SHADER
);
290 void si_copy_buffer(struct si_context
*sctx
,
291 struct pipe_resource
*dst
, struct pipe_resource
*src
,
292 uint64_t dst_offset
, uint64_t src_offset
, unsigned size
)
297 enum si_coherency coher
= SI_COHERENCY_SHADER
;
298 enum si_cache_policy cache_policy
= get_cache_policy(sctx
, coher
, size
);
300 /* Only use compute for VRAM copies on dGPUs. */
301 if (sctx
->screen
->info
.has_dedicated_vram
&&
302 si_resource(dst
)->domains
& RADEON_DOMAIN_VRAM
&&
303 si_resource(src
)->domains
& RADEON_DOMAIN_VRAM
&&
305 dst_offset
% 4 == 0 && src_offset
% 4 == 0 && size
% 4 == 0) {
306 si_compute_do_clear_or_copy(sctx
, dst
, dst_offset
, src
, src_offset
,
307 size
, NULL
, 0, coher
);
309 si_cp_dma_copy_buffer(sctx
, dst
, src
, dst_offset
, src_offset
, size
,
310 0, coher
, cache_policy
);
314 void si_compute_copy_image(struct si_context
*sctx
,
315 struct pipe_resource
*dst
,
317 struct pipe_resource
*src
,
319 unsigned dstx
, unsigned dsty
, unsigned dstz
,
320 const struct pipe_box
*src_box
)
322 struct pipe_context
*ctx
= &sctx
->b
;
323 unsigned width
= src_box
->width
;
324 unsigned height
= src_box
->height
;
325 unsigned depth
= src_box
->depth
;
327 unsigned data
[] = {src_box
->x
, src_box
->y
, src_box
->z
, 0, dstx
, dsty
, dstz
, 0};
329 if (width
== 0 || height
== 0)
332 si_compute_internal_begin(sctx
);
333 sctx
->flags
|= SI_CONTEXT_CS_PARTIAL_FLUSH
|
334 si_get_flush_flags(sctx
, SI_COHERENCY_SHADER
, L2_STREAM
);
336 /* src and dst have the same number of samples. */
337 si_make_CB_shader_coherent(sctx
, src
->nr_samples
, true,
338 /* Only src can have DCC.*/
339 ((struct si_texture
*)src
)->surface
.u
.gfx9
.dcc
.pipe_aligned
);
341 struct pipe_constant_buffer saved_cb
= {};
342 si_get_pipe_constant_buffer(sctx
, PIPE_SHADER_COMPUTE
, 0, &saved_cb
);
344 struct si_images
*images
= &sctx
->images
[PIPE_SHADER_COMPUTE
];
345 struct pipe_image_view saved_image
[2] = {0};
346 util_copy_image_view(&saved_image
[0], &images
->views
[0]);
347 util_copy_image_view(&saved_image
[1], &images
->views
[1]);
349 void *saved_cs
= sctx
->cs_shader_state
.program
;
351 struct pipe_constant_buffer cb
= {};
352 cb
.buffer_size
= sizeof(data
);
353 cb
.user_buffer
= data
;
354 ctx
->set_constant_buffer(ctx
, PIPE_SHADER_COMPUTE
, 0, &cb
);
356 struct pipe_image_view image
[2] = {0};
357 image
[0].resource
= src
;
358 image
[0].shader_access
= image
[0].access
= PIPE_IMAGE_ACCESS_READ
;
359 image
[0].format
= util_format_linear(src
->format
);
360 image
[0].u
.tex
.level
= src_level
;
361 image
[0].u
.tex
.first_layer
= 0;
362 image
[0].u
.tex
.last_layer
=
363 src
->target
== PIPE_TEXTURE_3D
? u_minify(src
->depth0
, src_level
) - 1
364 : (unsigned)(src
->array_size
- 1);
365 image
[1].resource
= dst
;
366 image
[1].shader_access
= image
[1].access
= PIPE_IMAGE_ACCESS_WRITE
;
367 image
[1].format
= util_format_linear(dst
->format
);
368 image
[1].u
.tex
.level
= dst_level
;
369 image
[1].u
.tex
.first_layer
= 0;
370 image
[1].u
.tex
.last_layer
=
371 dst
->target
== PIPE_TEXTURE_3D
? u_minify(dst
->depth0
, dst_level
) - 1
372 : (unsigned)(dst
->array_size
- 1);
374 if (src
->format
== PIPE_FORMAT_R9G9B9E5_FLOAT
)
375 image
[0].format
= image
[1].format
= PIPE_FORMAT_R32_UINT
;
377 /* SNORM8 blitting has precision issues on some chips. Use the SINT
378 * equivalent instead, which doesn't force DCC decompression.
379 * Note that some chips avoid this issue by using SDMA.
381 if (util_format_is_snorm8(dst
->format
)) {
382 image
[0].format
= image
[1].format
=
383 util_format_snorm8_to_sint8(dst
->format
);
386 ctx
->set_shader_images(ctx
, PIPE_SHADER_COMPUTE
, 0, 2, image
);
388 struct pipe_grid_info info
= {0};
390 if (dst
->target
== PIPE_TEXTURE_1D_ARRAY
&& src
->target
== PIPE_TEXTURE_1D_ARRAY
) {
391 if (!sctx
->cs_copy_image_1d_array
)
392 sctx
->cs_copy_image_1d_array
=
393 si_create_copy_image_compute_shader_1d_array(ctx
);
394 ctx
->bind_compute_state(ctx
, sctx
->cs_copy_image_1d_array
);
396 info
.last_block
[0] = width
% 64;
399 info
.grid
[0] = DIV_ROUND_UP(width
, 64);
400 info
.grid
[1] = depth
;
403 if (!sctx
->cs_copy_image
)
404 sctx
->cs_copy_image
= si_create_copy_image_compute_shader(ctx
);
405 ctx
->bind_compute_state(ctx
, sctx
->cs_copy_image
);
407 info
.last_block
[0] = width
% 8;
409 info
.last_block
[1] = height
% 8;
411 info
.grid
[0] = DIV_ROUND_UP(width
, 8);
412 info
.grid
[1] = DIV_ROUND_UP(height
, 8);
413 info
.grid
[2] = depth
;
416 ctx
->launch_grid(ctx
, &info
);
418 sctx
->flags
|= SI_CONTEXT_CS_PARTIAL_FLUSH
|
419 (sctx
->chip_class
<= VI
? SI_CONTEXT_WRITEBACK_GLOBAL_L2
: 0) |
420 si_get_flush_flags(sctx
, SI_COHERENCY_SHADER
, L2_STREAM
);
421 ctx
->bind_compute_state(ctx
, saved_cs
);
422 ctx
->set_shader_images(ctx
, PIPE_SHADER_COMPUTE
, 0, 2, saved_image
);
423 ctx
->set_constant_buffer(ctx
, PIPE_SHADER_COMPUTE
, 0, &saved_cb
);
424 si_compute_internal_end(sctx
);
427 void si_retile_dcc(struct si_context
*sctx
, struct si_texture
*tex
)
429 struct pipe_context
*ctx
= &sctx
->b
;
431 sctx
->flags
|= SI_CONTEXT_PS_PARTIAL_FLUSH
|
432 SI_CONTEXT_CS_PARTIAL_FLUSH
|
433 si_get_flush_flags(sctx
, SI_COHERENCY_CB_META
, L2_LRU
) |
434 si_get_flush_flags(sctx
, SI_COHERENCY_SHADER
, L2_LRU
);
435 si_emit_cache_flush(sctx
);
438 void *saved_cs
= sctx
->cs_shader_state
.program
;
439 struct pipe_image_view saved_img
[3] = {};
441 for (unsigned i
= 0; i
< 3; i
++) {
442 util_copy_image_view(&saved_img
[i
],
443 &sctx
->images
[PIPE_SHADER_COMPUTE
].views
[i
]);
447 bool use_uint16
= tex
->surface
.u
.gfx9
.dcc_retile_use_uint16
;
448 unsigned num_elements
= tex
->surface
.u
.gfx9
.dcc_retile_num_elements
;
449 struct pipe_image_view img
[3];
451 assert(tex
->dcc_retile_map_offset
&& tex
->dcc_retile_map_offset
<= UINT_MAX
);
452 assert(tex
->dcc_offset
&& tex
->dcc_offset
<= UINT_MAX
);
453 assert(tex
->display_dcc_offset
&& tex
->display_dcc_offset
<= UINT_MAX
);
455 for (unsigned i
= 0; i
< 3; i
++) {
456 img
[i
].resource
= &tex
->buffer
.b
.b
;
457 img
[i
].access
= i
== 2 ? PIPE_IMAGE_ACCESS_WRITE
: PIPE_IMAGE_ACCESS_READ
;
458 img
[i
].shader_access
= SI_IMAGE_ACCESS_AS_BUFFER
;
461 img
[0].format
= use_uint16
? PIPE_FORMAT_R16G16B16A16_UINT
:
462 PIPE_FORMAT_R32G32B32A32_UINT
;
463 img
[0].u
.buf
.offset
= tex
->dcc_retile_map_offset
;
464 img
[0].u
.buf
.size
= num_elements
* (use_uint16
? 2 : 4);
466 img
[1].format
= PIPE_FORMAT_R8_UINT
;
467 img
[1].u
.buf
.offset
= tex
->dcc_offset
;
468 img
[1].u
.buf
.size
= tex
->surface
.dcc_size
;
470 img
[2].format
= PIPE_FORMAT_R8_UINT
;
471 img
[2].u
.buf
.offset
= tex
->display_dcc_offset
;
472 img
[2].u
.buf
.size
= tex
->surface
.u
.gfx9
.display_dcc_size
;
474 ctx
->set_shader_images(ctx
, PIPE_SHADER_COMPUTE
, 0, 3, img
);
476 /* Bind the compute shader. */
477 if (!sctx
->cs_dcc_retile
)
478 sctx
->cs_dcc_retile
= si_create_dcc_retile_cs(ctx
);
479 ctx
->bind_compute_state(ctx
, sctx
->cs_dcc_retile
);
481 /* Dispatch compute. */
482 /* img[0] has 4 channels per element containing 2 pairs of DCC offsets. */
483 unsigned num_threads
= num_elements
/ 4;
485 struct pipe_grid_info info
= {};
489 info
.grid
[0] = DIV_ROUND_UP(num_threads
, 64); /* includes the partial block */
492 info
.last_block
[0] = num_threads
% 64;
494 ctx
->launch_grid(ctx
, &info
);
496 /* Don't flush caches or wait. The driver will wait at the end of this IB,
497 * and L2 will be flushed by the kernel fence.
500 /* Restore states. */
501 ctx
->bind_compute_state(ctx
, saved_cs
);
502 ctx
->set_shader_images(ctx
, PIPE_SHADER_COMPUTE
, 0, 3, saved_img
);
505 void si_init_compute_blit_functions(struct si_context
*sctx
)
507 sctx
->b
.clear_buffer
= si_pipe_clear_buffer
;
510 /* Clear a region of a color surface to a constant value. */
511 void si_compute_clear_render_target(struct pipe_context
*ctx
,
512 struct pipe_surface
*dstsurf
,
513 const union pipe_color_union
*color
,
514 unsigned dstx
, unsigned dsty
,
515 unsigned width
, unsigned height
,
516 bool render_condition_enabled
)
518 struct si_context
*sctx
= (struct si_context
*)ctx
;
519 unsigned num_layers
= dstsurf
->u
.tex
.last_layer
- dstsurf
->u
.tex
.first_layer
+ 1;
520 unsigned data
[4 + sizeof(color
->ui
)] = {dstx
, dsty
, dstsurf
->u
.tex
.first_layer
, 0};
522 if (width
== 0 || height
== 0)
525 if (util_format_is_srgb(dstsurf
->format
)) {
526 union pipe_color_union color_srgb
;
527 for (int i
= 0; i
< 3; i
++)
528 color_srgb
.f
[i
] = util_format_linear_to_srgb_float(color
->f
[i
]);
529 color_srgb
.f
[3] = color
->f
[3];
530 memcpy(data
+ 4, color_srgb
.ui
, sizeof(color
->ui
));
532 memcpy(data
+ 4, color
->ui
, sizeof(color
->ui
));
535 si_compute_internal_begin(sctx
);
536 sctx
->render_cond_force_off
= !render_condition_enabled
;
538 sctx
->flags
|= SI_CONTEXT_CS_PARTIAL_FLUSH
|
539 si_get_flush_flags(sctx
, SI_COHERENCY_SHADER
, L2_STREAM
);
540 si_make_CB_shader_coherent(sctx
, dstsurf
->texture
->nr_samples
, true,
541 true /* DCC is not possible with image stores */);
543 struct pipe_constant_buffer saved_cb
= {};
544 si_get_pipe_constant_buffer(sctx
, PIPE_SHADER_COMPUTE
, 0, &saved_cb
);
546 struct si_images
*images
= &sctx
->images
[PIPE_SHADER_COMPUTE
];
547 struct pipe_image_view saved_image
= {0};
548 util_copy_image_view(&saved_image
, &images
->views
[0]);
550 void *saved_cs
= sctx
->cs_shader_state
.program
;
552 struct pipe_constant_buffer cb
= {};
553 cb
.buffer_size
= sizeof(data
);
554 cb
.user_buffer
= data
;
555 ctx
->set_constant_buffer(ctx
, PIPE_SHADER_COMPUTE
, 0, &cb
);
557 struct pipe_image_view image
= {0};
558 image
.resource
= dstsurf
->texture
;
559 image
.shader_access
= image
.access
= PIPE_IMAGE_ACCESS_WRITE
;
560 image
.format
= util_format_linear(dstsurf
->format
);
561 image
.u
.tex
.level
= dstsurf
->u
.tex
.level
;
562 image
.u
.tex
.first_layer
= 0; /* 3D images ignore first_layer (BASE_ARRAY) */
563 image
.u
.tex
.last_layer
= dstsurf
->u
.tex
.last_layer
;
565 ctx
->set_shader_images(ctx
, PIPE_SHADER_COMPUTE
, 0, 1, &image
);
567 struct pipe_grid_info info
= {0};
569 if (dstsurf
->texture
->target
!= PIPE_TEXTURE_1D_ARRAY
) {
570 if (!sctx
->cs_clear_render_target
)
571 sctx
->cs_clear_render_target
= si_clear_render_target_shader(ctx
);
572 ctx
->bind_compute_state(ctx
, sctx
->cs_clear_render_target
);
574 info
.last_block
[0] = width
% 8;
576 info
.last_block
[1] = height
% 8;
578 info
.grid
[0] = DIV_ROUND_UP(width
, 8);
579 info
.grid
[1] = DIV_ROUND_UP(height
, 8);
580 info
.grid
[2] = num_layers
;
582 if (!sctx
->cs_clear_render_target_1d_array
)
583 sctx
->cs_clear_render_target_1d_array
=
584 si_clear_render_target_shader_1d_array(ctx
);
585 ctx
->bind_compute_state(ctx
, sctx
->cs_clear_render_target_1d_array
);
587 info
.last_block
[0] = width
% 64;
590 info
.grid
[0] = DIV_ROUND_UP(width
, 64);
591 info
.grid
[1] = num_layers
;
595 ctx
->launch_grid(ctx
, &info
);
597 sctx
->flags
|= SI_CONTEXT_CS_PARTIAL_FLUSH
|
598 (sctx
->chip_class
<= VI
? SI_CONTEXT_WRITEBACK_GLOBAL_L2
: 0) |
599 si_get_flush_flags(sctx
, SI_COHERENCY_SHADER
, L2_STREAM
);
600 ctx
->bind_compute_state(ctx
, saved_cs
);
601 ctx
->set_shader_images(ctx
, PIPE_SHADER_COMPUTE
, 0, 1, &saved_image
);
602 ctx
->set_constant_buffer(ctx
, PIPE_SHADER_COMPUTE
, 0, &saved_cb
);
603 si_compute_internal_end(sctx
);