radeonsi: use pipe_blend_state::max_rt to update fewer blend registers
[mesa.git] src/gallium/drivers/radeonsi/si_compute_blit.c
/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "si_pipe.h"
#include "util/format/u_format.h"
#include "util/format_srgb.h"

/* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
 * and L2_STREAM for src.
 */
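/* Worked example of the policy below: on GFX7 and newer, a 64 KB
 * SI_COHERENCY_SHADER transfer returns L2_LRU and a 1 MB one returns
 * L2_STREAM, while the remaining combinations return L2_BYPASS.
 */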
static enum si_cache_policy get_cache_policy(struct si_context *sctx, enum si_coherency coher,
                                             uint64_t size)
{
   if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META || coher == SI_COHERENCY_CP)) ||
       (sctx->chip_class >= GFX7 && coher == SI_COHERENCY_SHADER))
      return size <= 256 * 1024 ? L2_LRU : L2_STREAM;

   return L2_BYPASS;
}

unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
                            enum si_cache_policy cache_policy)
{
   switch (coher) {
   default:
   case SI_COHERENCY_NONE:
   case SI_COHERENCY_CP:
      return 0;
   case SI_COHERENCY_SHADER:
      return SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
             (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_L2 : 0);
   case SI_COHERENCY_CB_META:
      return SI_CONTEXT_FLUSH_AND_INV_CB;
   }
}

static void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *info)
{
   /* Set settings for driver-internal compute dispatches. */
   sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
   sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
   sctx->render_cond_force_off = true;
   /* Skip decompression to prevent infinite recursion. */
   if (sctx->blitter)
      sctx->blitter->running = true;

   /* Dispatch compute. */
   sctx->b.launch_grid(&sctx->b, info);

   /* Restore default settings. */
   sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
   sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
   sctx->render_cond_force_off = false;
   if (sctx->blitter)
      sctx->blitter->running = false;
}

static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe_resource *dst,
                                            unsigned dst_offset, unsigned size,
                                            const uint32_t *clear_value, enum si_coherency coher)
{
   struct pipe_context *ctx = &sctx->b;

   assert(dst_offset % 4 == 0);
   assert(size % 4 == 0);
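   /* The clear shader is expected to store one 12-byte element per thread,
    * so size_12 below is the number of threads to launch (hence the 64-wide
    * block and the last_block entry covering the partial group).
    */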
   unsigned size_12 = DIV_ROUND_UP(size, 12);

   unsigned data[4] = {0};
   memcpy(data, clear_value, 12);

   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
                  si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);

   struct pipe_shader_buffer saved_sb = {0};
   si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb);

   unsigned saved_writable_mask = 0;
   if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
       (1u << si_get_shaderbuf_slot(0)))
      saved_writable_mask = 1;

   struct pipe_constant_buffer saved_cb = {};
   si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);

   void *saved_cs = sctx->cs_shader_state.program;

   struct pipe_constant_buffer cb = {};
   cb.buffer_size = sizeof(data);
   cb.user_buffer = data;
   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);

   struct pipe_shader_buffer sb = {0};
   sb.buffer = dst;
   sb.buffer_offset = dst_offset;
   sb.buffer_size = size;

   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1);

   struct pipe_grid_info info = {0};

   if (!sctx->cs_clear_12bytes_buffer)
      sctx->cs_clear_12bytes_buffer = si_clear_12bytes_buffer_shader(ctx);
   ctx->bind_compute_state(ctx, sctx->cs_clear_12bytes_buffer);
   info.block[0] = 64;
   info.last_block[0] = size_12 % 64;
   info.block[1] = 1;
   info.block[2] = 1;
   info.grid[0] = DIV_ROUND_UP(size_12, 64);
   info.grid[1] = 1;
   info.grid[2] = 1;

   si_launch_grid_internal(sctx, &info);

   ctx->bind_compute_state(ctx, saved_cs);
   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask);
   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);

   pipe_resource_reference(&saved_sb.buffer, NULL);
   pipe_resource_reference(&saved_cb.buffer, NULL);
}

static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_resource *dst,
                                        unsigned dst_offset, struct pipe_resource *src,
                                        unsigned src_offset, unsigned size,
                                        const uint32_t *clear_value, unsigned clear_value_size,
                                        enum si_coherency coher)
{
   struct pipe_context *ctx = &sctx->b;

   assert(src_offset % 4 == 0);
   assert(dst_offset % 4 == 0);
   assert(size % 4 == 0);

   assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
   assert(!src || src_offset + size <= src->width0);

   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
                  si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);

   /* Save states. */
   void *saved_cs = sctx->cs_shader_state.program;
   struct pipe_shader_buffer saved_sb[2] = {};
   si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);

   unsigned saved_writable_mask = 0;
   for (unsigned i = 0; i < (src ? 2 : 1); i++) {
      if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
          (1u << si_get_shaderbuf_slot(i)))
         saved_writable_mask |= 1 << i;
   }

   /* The memory accesses are coalesced, meaning that the 1st instruction writes
    * the 1st contiguous block of data for the whole wave, the 2nd instruction
    * writes the 2nd contiguous block of data, etc.
    */
   unsigned dwords_per_thread =
      src ? SI_COMPUTE_COPY_DW_PER_THREAD : SI_COMPUTE_CLEAR_DW_PER_THREAD;
   unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
   unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
   unsigned wave_size = sctx->screen->compute_wave_size;
   unsigned dwords_per_wave = dwords_per_thread * wave_size;
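   /* Worked example with hypothetical numbers: with a wave size of 64 and
    * 4 dwords per thread, each thread issues one 4-dword store and a wave
    * covers 256 dwords (1 KiB), so a 1 MiB clear launches grid[0] = 1024
    * workgroups of block[0] = 64 threads.
    */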

   unsigned num_dwords = size / 4;
   unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

   struct pipe_grid_info info = {};
   info.block[0] = MIN2(wave_size, num_instructions);
   info.block[1] = 1;
   info.block[2] = 1;
   info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
   info.grid[1] = 1;
   info.grid[2] = 1;

   struct pipe_shader_buffer sb[2] = {};
   sb[0].buffer = dst;
   sb[0].buffer_offset = dst_offset;
   sb[0].buffer_size = size;

   bool shader_dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU;

   if (src) {
      sb[1].buffer = src;
      sb[1].buffer_offset = src_offset;
      sb[1].buffer_size = size;

      ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb, 0x1);

      if (!sctx->cs_copy_buffer) {
         sctx->cs_copy_buffer = si_create_dma_compute_shader(
            &sctx->b, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true);
      }
      ctx->bind_compute_state(ctx, sctx->cs_copy_buffer);
   } else {
      assert(clear_value_size >= 4 && clear_value_size <= 16 &&
             util_is_power_of_two_or_zero(clear_value_size));

      for (unsigned i = 0; i < 4; i++)
         sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];

      ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb, 0x1);

      if (!sctx->cs_clear_buffer) {
         sctx->cs_clear_buffer = si_create_dma_compute_shader(
            &sctx->b, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false);
      }
      ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
   }

   si_launch_grid_internal(sctx, &info);

   enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0);

   if (cache_policy != L2_BYPASS)
      si_resource(dst)->TC_L2_dirty = true;

   /* Restore states. */
   ctx->bind_compute_state(ctx, saved_cs);
   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb, saved_writable_mask);
   for (int i = 0; i < 2; i++)
      pipe_resource_reference(&saved_sb[i].buffer, NULL);
}

void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset,
                     uint64_t size, uint32_t *clear_value, uint32_t clear_value_size,
                     enum si_coherency coher, bool force_cpdma)
{
   if (!size)
      return;

   ASSERTED unsigned clear_alignment = MIN2(clear_value_size, 4);

   assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
   assert(offset % clear_alignment == 0);
   assert(size % clear_alignment == 0);
   assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */

   /* Reduce a large clear value size if possible. */
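   /* For example, a 16-byte clear value whose four dwords are identical can
    * be lowered to a plain 4-byte (dword) fill.
    */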
   if (clear_value_size > 4) {
      bool clear_dword_duplicated = true;

      /* See if we can lower large fills to dword fills. */
      for (unsigned i = 1; i < clear_value_size / 4; i++) {
         if (clear_value[0] != clear_value[i]) {
            clear_dword_duplicated = false;
            break;
         }
      }
      if (clear_dword_duplicated)
         clear_value_size = 4;
   }

   /* Expand a small clear value size. */
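   /* For example, an 8-bit value 0xAB is replicated to the dword 0xABABABAB,
    * and a 16-bit value 0xABCD becomes 0xABCDABCD.
    */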
   uint32_t tmp_clear_value;
   if (clear_value_size <= 2) {
      if (clear_value_size == 1) {
         tmp_clear_value = *(uint8_t *)clear_value;
         tmp_clear_value |=
            (tmp_clear_value << 8) | (tmp_clear_value << 16) | (tmp_clear_value << 24);
      } else {
         tmp_clear_value = *(uint16_t *)clear_value;
         tmp_clear_value |= tmp_clear_value << 16;
      }
      clear_value = &tmp_clear_value;
      clear_value_size = 4;
   }

   if (clear_value_size == 12) {
      si_compute_clear_12bytes_buffer(sctx, dst, offset, size, clear_value, coher);
      return;
   }

   uint64_t aligned_size = size & ~3ull;
   if (aligned_size >= 4) {
      /* Before GFX9, CP DMA was very slow when clearing GTT, so never
       * use CP DMA clears on those chips, because we can't be certain
       * about buffer placements.
       */
      if (clear_value_size > 4 || (!force_cpdma && clear_value_size == 4 && offset % 4 == 0 &&
                                   (size > 32 * 1024 || sctx->chip_class <= GFX9))) {
         si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0, aligned_size, clear_value,
                                     clear_value_size, coher);
      } else {
         assert(clear_value_size == 4);
         si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset, aligned_size, *clear_value, 0,
                                coher, get_cache_policy(sctx, coher, size));
      }

      offset += aligned_size;
      size -= aligned_size;
   }

   /* Handle non-dword alignment. */
   if (size) {
      assert(dst);
      assert(dst->target == PIPE_BUFFER);
      assert(size < 4);

      pipe_buffer_write(&sctx->b, dst, offset, size, clear_value);
   }
}

static void si_pipe_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
                                 unsigned offset, unsigned size, const void *clear_value,
                                 int clear_value_size)
{
   si_clear_buffer((struct si_context *)ctx, dst, offset, size, (uint32_t *)clear_value,
                   clear_value_size, SI_COHERENCY_SHADER, false);
}

void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
                    uint64_t dst_offset, uint64_t src_offset, unsigned size)
{
   if (!size)
      return;

   enum si_coherency coher = SI_COHERENCY_SHADER;
   enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);

   /* Only use compute for VRAM copies on dGPUs. */
   if (sctx->screen->info.has_dedicated_vram && si_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
       si_resource(src)->domains & RADEON_DOMAIN_VRAM && size > 32 * 1024 && dst_offset % 4 == 0 &&
       src_offset % 4 == 0 && size % 4 == 0) {
      si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset, size, NULL, 0, coher);
   } else {
      si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, 0, coher, cache_policy);
   }
}

void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level,
                           struct pipe_resource *src, unsigned src_level, unsigned dstx,
                           unsigned dsty, unsigned dstz, const struct pipe_box *src_box)
{
   struct pipe_context *ctx = &sctx->b;
   unsigned width = src_box->width;
   unsigned height = src_box->height;
   unsigned depth = src_box->depth;
   enum pipe_format src_format = util_format_linear(src->format);
   enum pipe_format dst_format = util_format_linear(dst->format);

   assert(util_format_is_subsampled_422(src_format) == util_format_is_subsampled_422(dst_format));

   if (util_format_is_subsampled_422(src_format)) {
      src_format = dst_format = PIPE_FORMAT_R32_UINT;
      /* Interpreting a 422 subsampled format (16 bpp) as 32 bpp
       * should force us to divide src_box->x, dstx and width by 2.
       * However, since ac_surface allocates this format as 32 bpp
       * and surf_size is then modified to pack the values, we must
       * keep the original values to get the correct results.
       */
   }
   unsigned data[] = {src_box->x, src_box->y, src_box->z, 0, dstx, dsty, dstz, 0};

   if (width == 0 || height == 0)
      return;

   sctx->flags |=
      SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);

   /* The driver doesn't decompress resources automatically here. */
   si_decompress_subresource(ctx, dst, PIPE_MASK_RGBAZS, dst_level, dstz,
                             dstz + src_box->depth - 1);
   si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level, src_box->z,
                             src_box->z + src_box->depth - 1);

   /* src and dst have the same number of samples. */
   si_make_CB_shader_coherent(sctx, src->nr_samples, true,
                              /* Only src can have DCC. */
                              ((struct si_texture *)src)->surface.u.gfx9.dcc.pipe_aligned);

   struct pipe_constant_buffer saved_cb = {};
   si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);

   struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
   struct pipe_image_view saved_image[2] = {0};
   util_copy_image_view(&saved_image[0], &images->views[0]);
   util_copy_image_view(&saved_image[1], &images->views[1]);

   void *saved_cs = sctx->cs_shader_state.program;

   struct pipe_constant_buffer cb = {};
   cb.buffer_size = sizeof(data);
   cb.user_buffer = data;
   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);

   struct pipe_image_view image[2] = {0};
   image[0].resource = src;
   image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
   image[0].format = src_format;
   image[0].u.tex.level = src_level;
   image[0].u.tex.first_layer = 0;
   image[0].u.tex.last_layer = src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1
                                                              : (unsigned)(src->array_size - 1);
   image[1].resource = dst;
   image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
   image[1].format = dst_format;
   image[1].u.tex.level = dst_level;
   image[1].u.tex.first_layer = 0;
   image[1].u.tex.last_layer = dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1
                                                              : (unsigned)(dst->array_size - 1);

   if (src->format == PIPE_FORMAT_R9G9B9E5_FLOAT)
      image[0].format = image[1].format = PIPE_FORMAT_R32_UINT;

   /* SNORM8 blitting has precision issues on some chips. Use the SINT
    * equivalent instead, which doesn't force DCC decompression.
    * Note that some chips avoid this issue by using SDMA.
    */
   if (util_format_is_snorm8(dst->format)) {
      image[0].format = image[1].format = util_format_snorm8_to_sint8(dst->format);
   }

   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image);

   struct pipe_grid_info info = {0};

   if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) {
      if (!sctx->cs_copy_image_1d_array)
         sctx->cs_copy_image_1d_array = si_create_copy_image_compute_shader_1d_array(ctx);
      ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array);
      info.block[0] = 64;
      info.last_block[0] = width % 64;
      info.block[1] = 1;
      info.block[2] = 1;
      info.grid[0] = DIV_ROUND_UP(width, 64);
      info.grid[1] = depth;
      info.grid[2] = 1;
   } else {
      if (!sctx->cs_copy_image)
         sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx);
      ctx->bind_compute_state(ctx, sctx->cs_copy_image);
      info.block[0] = 8;
      info.last_block[0] = width % 8;
      info.block[1] = 8;
      info.last_block[1] = height % 8;
      info.block[2] = 1;
      info.grid[0] = DIV_ROUND_UP(width, 8);
      info.grid[1] = DIV_ROUND_UP(height, 8);
      info.grid[2] = depth;
   }

   si_launch_grid_internal(sctx, &info);

   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
                  si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
   ctx->bind_compute_state(ctx, saved_cs);
   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
   for (int i = 0; i < 2; i++)
      pipe_resource_reference(&saved_image[i].resource, NULL);
   pipe_resource_reference(&saved_cb.buffer, NULL);
}

void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
{
   struct pipe_context *ctx = &sctx->b;

   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
                  si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU) |
                  si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_LRU);
   sctx->emit_cache_flush(sctx);

   /* Save states. */
   void *saved_cs = sctx->cs_shader_state.program;
   struct pipe_image_view saved_img[3] = {};

   for (unsigned i = 0; i < 3; i++) {
      util_copy_image_view(&saved_img[i], &sctx->images[PIPE_SHADER_COMPUTE].views[i]);
   }

   /* Set images. */
   bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16;
   unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements;
   struct pipe_image_view img[3];

   assert(tex->surface.dcc_retile_map_offset && tex->surface.dcc_retile_map_offset <= UINT_MAX);
   assert(tex->surface.dcc_offset && tex->surface.dcc_offset <= UINT_MAX);
   assert(tex->surface.display_dcc_offset && tex->surface.display_dcc_offset <= UINT_MAX);

   for (unsigned i = 0; i < 3; i++) {
      img[i].resource = &tex->buffer.b.b;
      img[i].access = i == 2 ? PIPE_IMAGE_ACCESS_WRITE : PIPE_IMAGE_ACCESS_READ;
      img[i].shader_access = SI_IMAGE_ACCESS_AS_BUFFER;
   }

   img[0].format = use_uint16 ? PIPE_FORMAT_R16G16B16A16_UINT : PIPE_FORMAT_R32G32B32A32_UINT;
   img[0].u.buf.offset = tex->surface.dcc_retile_map_offset;
   img[0].u.buf.size = num_elements * (use_uint16 ? 2 : 4);

   img[1].format = PIPE_FORMAT_R8_UINT;
   img[1].u.buf.offset = tex->surface.dcc_offset;
   img[1].u.buf.size = tex->surface.dcc_size;

   img[2].format = PIPE_FORMAT_R8_UINT;
   img[2].u.buf.offset = tex->surface.display_dcc_offset;
   img[2].u.buf.size = tex->surface.u.gfx9.display_dcc_size;

   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, img);

   /* Bind the compute shader. */
   if (!sctx->cs_dcc_retile)
      sctx->cs_dcc_retile = si_create_dcc_retile_cs(ctx);
   ctx->bind_compute_state(ctx, sctx->cs_dcc_retile);

   /* Dispatch compute. */
   /* img[0] has 4 channels per element containing 2 pairs of DCC offsets. */
   unsigned num_threads = num_elements / 4;

   struct pipe_grid_info info = {};
   info.block[0] = 64;
   info.block[1] = 1;
   info.block[2] = 1;
   info.grid[0] = DIV_ROUND_UP(num_threads, 64); /* includes the partial block */
   info.grid[1] = 1;
   info.grid[2] = 1;
   info.last_block[0] = num_threads % 64;

   si_launch_grid_internal(sctx, &info);

   /* Don't flush caches or wait. The driver will wait at the end of this IB,
    * and L2 will be flushed by the kernel fence.
    */

   /* Restore states. */
   ctx->bind_compute_state(ctx, saved_cs);
   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, saved_img);

   for (unsigned i = 0; i < 3; i++) {
      pipe_resource_reference(&saved_img[i].resource, NULL);
   }
}

/* Expand FMASK to make it identity, so that image stores can ignore it. */
void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex)
{
   struct si_context *sctx = (struct si_context *)ctx;
   bool is_array = tex->target == PIPE_TEXTURE_2D_ARRAY;
   unsigned log_fragments = util_logbase2(tex->nr_storage_samples);
   unsigned log_samples = util_logbase2(tex->nr_samples);
   assert(tex->nr_samples >= 2);

   /* EQAA FMASK expansion is unimplemented. */
   if (tex->nr_samples != tex->nr_storage_samples)
      return;

   /* Flush caches and sync engines. */
   sctx->flags |=
      SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
   si_make_CB_shader_coherent(sctx, tex->nr_samples, true,
                              true /* DCC is not possible with image stores */);

   /* Save states. */
   void *saved_cs = sctx->cs_shader_state.program;
   struct pipe_image_view saved_image = {0};
   util_copy_image_view(&saved_image, &sctx->images[PIPE_SHADER_COMPUTE].views[0]);

   /* Bind the image. */
   struct pipe_image_view image = {0};
   image.resource = tex;
   /* Don't set WRITE so as not to trigger FMASK expansion, causing
    * an infinite loop. */
   image.shader_access = image.access = PIPE_IMAGE_ACCESS_READ;
   image.format = util_format_linear(tex->format);
   if (is_array)
      image.u.tex.last_layer = tex->array_size - 1;

   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image);

   /* Bind the shader. */
   void **shader = &sctx->cs_fmask_expand[log_samples - 1][is_array];
   if (!*shader)
      *shader = si_create_fmask_expand_cs(ctx, tex->nr_samples, is_array);
   ctx->bind_compute_state(ctx, *shader);

   /* Dispatch compute. */
   struct pipe_grid_info info = {0};
   info.block[0] = 8;
   info.last_block[0] = tex->width0 % 8;
   info.block[1] = 8;
   info.last_block[1] = tex->height0 % 8;
   info.block[2] = 1;
   info.grid[0] = DIV_ROUND_UP(tex->width0, 8);
   info.grid[1] = DIV_ROUND_UP(tex->height0, 8);
   info.grid[2] = is_array ? tex->array_size : 1;

   si_launch_grid_internal(sctx, &info);

   /* Flush caches and sync engines. */
   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
                  si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);

   /* Restore previous states. */
   ctx->bind_compute_state(ctx, saved_cs);
   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
   pipe_resource_reference(&saved_image.resource, NULL);

   /* Array of fully expanded FMASK values, arranged by [log2(fragments)][log2(samples)-1]. */
#define INVALID 0 /* never used */
   static const uint64_t fmask_expand_values[][4] = {
      /* samples */
      /* 2 (8 bpp)  4 (8 bpp)   8 (8-32bpp) 16 (16-64bpp)      fragments */
      {0x02020202, 0x0E0E0E0E, 0xFEFEFEFE, 0xFFFEFFFE},        /* 1 */
      {0x02020202, 0xA4A4A4A4, 0xAAA4AAA4, 0xAAAAAAA4},        /* 2 */
      {INVALID, 0xE4E4E4E4, 0x44443210, 0x4444444444443210},   /* 4 */
      {INVALID, INVALID, 0x76543210, 0x8888888876543210},      /* 8 */
   };

   /* Clear FMASK to identity. */
   struct si_texture *stex = (struct si_texture *)tex;
   si_clear_buffer(sctx, tex, stex->surface.fmask_offset, stex->surface.fmask_size,
                   (uint32_t *)&fmask_expand_values[log_fragments][log_samples - 1], 4,
                   SI_COHERENCY_SHADER, false);
}

void si_init_compute_blit_functions(struct si_context *sctx)
{
   sctx->b.clear_buffer = si_pipe_clear_buffer;
}

/* Clear a region of a color surface to a constant value. */
void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dstsurf,
                                    const union pipe_color_union *color, unsigned dstx,
                                    unsigned dsty, unsigned width, unsigned height,
                                    bool render_condition_enabled)
{
   struct si_context *sctx = (struct si_context *)ctx;
   unsigned num_layers = dstsurf->u.tex.last_layer - dstsurf->u.tex.first_layer + 1;
   unsigned data[4 + sizeof(color->ui)] = {dstx, dsty, dstsurf->u.tex.first_layer, 0};

   if (width == 0 || height == 0)
      return;

   /* The driver doesn't decompress resources automatically here. */
   si_decompress_subresource(ctx, dstsurf->texture, PIPE_MASK_RGBA, dstsurf->u.tex.level,
                             dstsurf->u.tex.first_layer, dstsurf->u.tex.last_layer);

   if (util_format_is_srgb(dstsurf->format)) {
      union pipe_color_union color_srgb;
      for (int i = 0; i < 3; i++)
         color_srgb.f[i] = util_format_linear_to_srgb_float(color->f[i]);
      color_srgb.f[3] = color->f[3];
      memcpy(data + 4, color_srgb.ui, sizeof(color->ui));
   } else {
      memcpy(data + 4, color->ui, sizeof(color->ui));
   }
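   /* At this point data[] holds dstx, dsty and first_layer in dwords 0-2 and
    * the (possibly sRGB-encoded) clear color in dwords 4-7; this is the
    * constant buffer layout the clear shaders below presumably consume.
    */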

   sctx->render_cond_force_off = !render_condition_enabled;

   sctx->flags |=
      SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
   si_make_CB_shader_coherent(sctx, dstsurf->texture->nr_samples, true,
                              true /* DCC is not possible with image stores */);

   struct pipe_constant_buffer saved_cb = {};
   si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);

   struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
   struct pipe_image_view saved_image = {0};
   util_copy_image_view(&saved_image, &images->views[0]);

   void *saved_cs = sctx->cs_shader_state.program;

   struct pipe_constant_buffer cb = {};
   cb.buffer_size = sizeof(data);
   cb.user_buffer = data;
   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);

   struct pipe_image_view image = {0};
   image.resource = dstsurf->texture;
   image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE;
   image.format = util_format_linear(dstsurf->format);
   image.u.tex.level = dstsurf->u.tex.level;
   image.u.tex.first_layer = 0; /* 3D images ignore first_layer (BASE_ARRAY) */
   image.u.tex.last_layer = dstsurf->u.tex.last_layer;

   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image);

   struct pipe_grid_info info = {0};

   if (dstsurf->texture->target != PIPE_TEXTURE_1D_ARRAY) {
      if (!sctx->cs_clear_render_target)
         sctx->cs_clear_render_target = si_clear_render_target_shader(ctx);
      ctx->bind_compute_state(ctx, sctx->cs_clear_render_target);
      info.block[0] = 8;
      info.last_block[0] = width % 8;
      info.block[1] = 8;
      info.last_block[1] = height % 8;
      info.block[2] = 1;
      info.grid[0] = DIV_ROUND_UP(width, 8);
      info.grid[1] = DIV_ROUND_UP(height, 8);
      info.grid[2] = num_layers;
   } else {
      if (!sctx->cs_clear_render_target_1d_array)
         sctx->cs_clear_render_target_1d_array = si_clear_render_target_shader_1d_array(ctx);
      ctx->bind_compute_state(ctx, sctx->cs_clear_render_target_1d_array);
      info.block[0] = 64;
      info.last_block[0] = width % 64;
      info.block[1] = 1;
      info.block[2] = 1;
      info.grid[0] = DIV_ROUND_UP(width, 64);
      info.grid[1] = num_layers;
      info.grid[2] = 1;
   }

   si_launch_grid_internal(sctx, &info);

   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
                  si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
   ctx->bind_compute_state(ctx, saved_cs);
   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
   pipe_resource_reference(&saved_image.resource, NULL);
   pipe_resource_reference(&saved_cb.buffer, NULL);
}