- struct pipe_context *ctx = &sctx->b;
- unsigned width = src_box->width;
- unsigned height = src_box->height;
- unsigned depth = src_box->depth;
- enum pipe_format src_format = util_format_linear(src->format);
- enum pipe_format dst_format = util_format_linear(dst->format);
-
- assert(util_format_is_subsampled_422(src_format) ==
- util_format_is_subsampled_422(dst_format));
-
- if (util_format_is_subsampled_422(src_format)) {
- src_format = dst_format = PIPE_FORMAT_R32_UINT;
- /* Interpreting 422 subsampled format (16 bpp) as 32 bpp
- * should force us to divide src_box->x, dstx and width by 2.
- * But given that ac_surface allocates this format as 32 bpp
- * and that surf_size is then modified to pack the values
- * we must keep the original values to get the correct results.
- */
- }
- unsigned data[] = {src_box->x, src_box->y, src_box->z, 0,
- dstx, dsty, dstz, 0};
-
- if (width == 0 || height == 0)
- return;
-
- sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
- si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
-
- /* The driver doesn't decompress resources automatically here. */
- si_decompress_subresource(ctx, dst, PIPE_MASK_RGBAZS, dst_level,
- dstz, dstz + src_box->depth - 1);
- si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level,
- src_box->z, src_box->z + src_box->depth - 1);
-
- /* src and dst have the same number of samples. */
- si_make_CB_shader_coherent(sctx, src->nr_samples, true,
- /* Only src can have DCC.*/
- ((struct si_texture*)src)->surface.u.gfx9.dcc.pipe_aligned);
-
- struct pipe_constant_buffer saved_cb = {};
- si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
-
- struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
- struct pipe_image_view saved_image[2] = {0};
- util_copy_image_view(&saved_image[0], &images->views[0]);
- util_copy_image_view(&saved_image[1], &images->views[1]);
-
- void *saved_cs = sctx->cs_shader_state.program;
-
- struct pipe_constant_buffer cb = {};
- cb.buffer_size = sizeof(data);
- cb.user_buffer = data;
- ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
-
- struct pipe_image_view image[2] = {0};
- image[0].resource = src;
- image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
- image[0].format = src_format;
- image[0].u.tex.level = src_level;
- image[0].u.tex.first_layer = 0;
- image[0].u.tex.last_layer =
- src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1
- : (unsigned)(src->array_size - 1);
- image[1].resource = dst;
- image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
- image[1].format = dst_format;
- image[1].u.tex.level = dst_level;
- image[1].u.tex.first_layer = 0;
- image[1].u.tex.last_layer =
- dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1
- : (unsigned)(dst->array_size - 1);
-
- if (src->format == PIPE_FORMAT_R9G9B9E5_FLOAT)
- image[0].format = image[1].format = PIPE_FORMAT_R32_UINT;
-
- /* SNORM8 blitting has precision issues on some chips. Use the SINT
- * equivalent instead, which doesn't force DCC decompression.
- * Note that some chips avoid this issue by using SDMA.
- */
- if (util_format_is_snorm8(dst->format)) {
- image[0].format = image[1].format =
- util_format_snorm8_to_sint8(dst->format);
- }
-
- ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image);
-
- struct pipe_grid_info info = {0};
-
- if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) {
- if (!sctx->cs_copy_image_1d_array)
- sctx->cs_copy_image_1d_array =
- si_create_copy_image_compute_shader_1d_array(ctx);
- ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array);
- info.block[0] = 64;
- info.last_block[0] = width % 64;
- info.block[1] = 1;
- info.block[2] = 1;
- info.grid[0] = DIV_ROUND_UP(width, 64);
- info.grid[1] = depth;
- info.grid[2] = 1;
- } else {
- if (!sctx->cs_copy_image)
- sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx);
- ctx->bind_compute_state(ctx, sctx->cs_copy_image);
- info.block[0] = 8;
- info.last_block[0] = width % 8;
- info.block[1] = 8;
- info.last_block[1] = height % 8;
- info.block[2] = 1;
- info.grid[0] = DIV_ROUND_UP(width, 8);
- info.grid[1] = DIV_ROUND_UP(height, 8);
- info.grid[2] = depth;
- }
-
- si_launch_grid_internal(sctx, &info);
-
- sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
- (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
- si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
- ctx->bind_compute_state(ctx, saved_cs);
- ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
- ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
- for (int i = 0; i < 2; i++)
- pipe_resource_reference(&saved_image[i].resource, NULL);
- pipe_resource_reference(&saved_cb.buffer, NULL);
+ struct pipe_context *ctx = &sctx->b;
+ unsigned width = src_box->width;
+ unsigned height = src_box->height;
+ unsigned depth = src_box->depth;
+ enum pipe_format src_format = util_format_linear(src->format);
+ enum pipe_format dst_format = util_format_linear(dst->format);
+ bool is_linear = ((struct si_texture*)src)->surface.is_linear ||
+ ((struct si_texture*)dst)->surface.is_linear;
+
+ assert(util_format_is_subsampled_422(src_format) == util_format_is_subsampled_422(dst_format));
+
+ if (!vi_dcc_enabled((struct si_texture*)src, src_level) &&
+ src_format == dst_format &&
+ util_format_is_float(src_format) &&
+ !util_format_is_compressed(src_format)) {
+ /* Interpret as integer values to avoid NaN issues */
+ switch(util_format_get_blocksizebits(src_format)) {
+ case 16:
+ src_format = dst_format = PIPE_FORMAT_R16_UINT;
+ break;
+ case 32:
+ src_format = dst_format = PIPE_FORMAT_R32_UINT;
+ break;
+ case 64:
+ src_format = dst_format = PIPE_FORMAT_R32G32_UINT;
+ break;
+ case 128:
+ src_format = dst_format = PIPE_FORMAT_R32G32B32A32_UINT;
+ break;
+ default:
+ assert(false);
+ }
+ }
+
+ if (util_format_is_subsampled_422(src_format)) {
+ src_format = dst_format = PIPE_FORMAT_R32_UINT;
+ /* Interpreting 422 subsampled format (16 bpp) as 32 bpp
+ * should force us to divide src_box->x, dstx and width by 2.
+ * But given that ac_surface allocates this format as 32 bpp
+ * and that surf_size is then modified to pack the values
+ * we must keep the original values to get the correct results.
+ */
+ }
+
+ if (width == 0 || height == 0)
+ return;
+
+ /* The driver doesn't decompress resources automatically here. */
+ si_decompress_subresource(ctx, dst, PIPE_MASK_RGBAZS, dst_level, dstz,
+ dstz + src_box->depth - 1);
+ si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level, src_box->z,
+ src_box->z + src_box->depth - 1);
+
+ /* src and dst have the same number of samples. */
+ si_make_CB_shader_coherent(sctx, src->nr_samples, true,
+ /* Only src can have DCC. */
+ ((struct si_texture *)src)->surface.u.gfx9.dcc.pipe_aligned);
+
+ struct pipe_constant_buffer saved_cb = {};
+
+ struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
+ struct pipe_image_view saved_image[2] = {0};
+ util_copy_image_view(&saved_image[0], &images->views[0]);
+ util_copy_image_view(&saved_image[1], &images->views[1]);
+
+ void *saved_cs = sctx->cs_shader_state.program;
+
+ if (!is_dcc_decompress) {
+ unsigned data[] = {src_box->x, src_box->y, src_box->z, 0, dstx, dsty, dstz, 0};
+
+ si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+
+ struct pipe_constant_buffer cb = {};
+ cb.buffer_size = sizeof(data);
+ cb.user_buffer = data;
+ ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
+ }
+
+ struct pipe_image_view image[2] = {0};
+ image[0].resource = src;
+ image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
+ image[0].format = src_format;
+ image[0].u.tex.level = src_level;
+ image[0].u.tex.first_layer = 0;
+ image[0].u.tex.last_layer = src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1
+ : (unsigned)(src->array_size - 1);
+ image[1].resource = dst;
+ image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
+ image[1].format = dst_format;
+ image[1].u.tex.level = dst_level;
+ image[1].u.tex.first_layer = 0;
+ image[1].u.tex.last_layer = dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1
+ : (unsigned)(dst->array_size - 1);
+
+ /* SNORM8 blitting has precision issues on some chips. Use the SINT
+ * equivalent instead, which doesn't force DCC decompression.
+ * Note that some chips avoid this issue by using SDMA.
+ */
+ if (util_format_is_snorm8(dst->format)) {
+ image[0].format = image[1].format = util_format_snorm8_to_sint8(dst->format);
+ }
+
+ if (is_dcc_decompress)
+ image[1].access |= SI_IMAGE_ACCESS_DCC_OFF;
+
+ ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image);
+
+ struct pipe_grid_info info = {0};
+
+ if (is_dcc_decompress) {
+ /* The DCC decompression is a normal blit where the load is compressed
+ * and the store is uncompressed. The workgroup size is either equal to
+ * the DCC block size or a multiple thereof. The shader uses a barrier
+ * between loads and stores to safely overwrite each DCC block of pixels.
+ */
+ struct si_texture *tex = (struct si_texture*)src;
+ unsigned dim[3] = {src_box->width, src_box->height, src_box->depth};
+
+ assert(src == dst);
+ assert(dst->target != PIPE_TEXTURE_1D && dst->target != PIPE_TEXTURE_1D_ARRAY);
+
+ if (!sctx->cs_dcc_decompress)
+ sctx->cs_dcc_decompress = si_create_dcc_decompress_cs(ctx);
+ ctx->bind_compute_state(ctx, sctx->cs_dcc_decompress);
+
+ info.block[0] = tex->surface.u.gfx9.dcc_block_width;
+ info.block[1] = tex->surface.u.gfx9.dcc_block_height;
+ info.block[2] = tex->surface.u.gfx9.dcc_block_depth;
+
+ /* Make sure the block size is at least the same as wave size. */
+ while (info.block[0] * info.block[1] * info.block[2] <
+ sctx->screen->compute_wave_size) {
+ info.block[0] *= 2;
+ }
+
+ for (unsigned i = 0; i < 3; i++) {
+ info.last_block[i] = dim[i] % info.block[i];
+ info.grid[i] = DIV_ROUND_UP(dim[i], info.block[i]);
+ }
+ } else if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) {
+ if (!sctx->cs_copy_image_1d_array)
+ sctx->cs_copy_image_1d_array = si_create_copy_image_compute_shader_1d_array(ctx);
+ ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array);
+ info.block[0] = 64;
+ info.last_block[0] = width % 64;
+ info.block[1] = 1;
+ info.block[2] = 1;
+ info.grid[0] = DIV_ROUND_UP(width, 64);
+ info.grid[1] = depth;
+ info.grid[2] = 1;
+ } else {
+ if (!sctx->cs_copy_image)
+ sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx);
+ ctx->bind_compute_state(ctx, sctx->cs_copy_image);
+
+ /* This is better for access over PCIe. */
+ if (is_linear) {
+ info.block[0] = 64;
+ info.block[1] = 1;
+ } else {
+ info.block[0] = 8;
+ info.block[1] = 8;
+ }
+ info.last_block[0] = width % info.block[0];
+ info.last_block[1] = height % info.block[1];
+ info.block[2] = 1;
+ info.grid[0] = DIV_ROUND_UP(width, info.block[0]);
+ info.grid[1] = DIV_ROUND_UP(height, info.block[1]);
+ info.grid[2] = depth;
+ }
+
+ si_launch_grid_internal(sctx, &info, saved_cs,
+ SI_CS_WAIT_FOR_IDLE | SI_CS_IMAGE_OP);
+
+ ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
+ for (int i = 0; i < 2; i++)
+ pipe_resource_reference(&saved_image[i].resource, NULL);
+ if (!is_dcc_decompress) {
+ ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+ pipe_resource_reference(&saved_cb.buffer, NULL);
+ }