radeonsi: switch to 3-spaces style
author    Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Fri, 27 Mar 2020 18:32:38 +0000 (19:32 +0100)
committer Marge Bot <eric+marge@anholt.net>
Mon, 30 Mar 2020 11:05:52 +0000 (11:05 +0000)
Generated automatically using clang-format and the following config:

AlignAfterOpenBracket: true
AlignConsecutiveMacros: true
AllowAllArgumentsOnNextLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: false
AlwaysBreakAfterReturnType: None
BasedOnStyle: LLVM
BraceWrapping:
  AfterControlStatement: false
  AfterEnum: true
  AfterFunction: true
  AfterStruct: false
  BeforeElse: false
  SplitEmptyFunction: true
BinPackArguments: true
BinPackParameters: true
BreakBeforeBraces: Custom
ColumnLimit: 100
ContinuationIndentWidth: 3
Cpp11BracedListStyle: false
ForEachMacros:
  - LIST_FOR_EACH_ENTRY
  - LIST_FOR_EACH_ENTRY_SAFE
  - util_dynarray_foreach
  - nir_foreach_variable
  - nir_foreach_variable_safe
  - nir_foreach_register
  - nir_foreach_register_safe
  - nir_foreach_use
  - nir_foreach_use_safe
  - nir_foreach_if_use
  - nir_foreach_if_use_safe
  - nir_foreach_def
  - nir_foreach_def_safe
  - nir_foreach_phi_src
  - nir_foreach_phi_src_safe
  - nir_foreach_parallel_copy_entry
  - nir_foreach_instr
  - nir_foreach_instr_reverse
  - nir_foreach_instr_safe
  - nir_foreach_instr_reverse_safe
  - nir_foreach_function
  - nir_foreach_block
  - nir_foreach_block_safe
  - nir_foreach_block_reverse
  - nir_foreach_block_reverse_safe
  - nir_foreach_block_in_cf_node
IncludeBlocks: Regroup
IncludeCategories:
  - Regex:           '<[[:alnum:].]+>'
    Priority:        2
  - Regex:           '.*'
    Priority:        1
IndentWidth: 3
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyExcessCharacter: 100
SpaceAfterCStyleCast: false
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: false
SpacesInContainerLiterals: false
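
A minimal sketch of how such a config can be re-applied (assuming the options
above are saved as src/gallium/drivers/radeonsi/.clang-format; the exact
clang-format version and command used for this commit are not recorded here):

    # hypothetical invocation: in-place reformat of the driver sources
    cd src/gallium/drivers/radeonsi
    clang-format -i --style=file *.c *.h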

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4319>

52 files changed:
src/gallium/drivers/radeonsi/.editorconfig [deleted file]
src/gallium/drivers/radeonsi/cik_sdma.c
src/gallium/drivers/radeonsi/driinfo_radeonsi.h
src/gallium/drivers/radeonsi/gfx10_query.c
src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
src/gallium/drivers/radeonsi/si_blit.c
src/gallium/drivers/radeonsi/si_buffer.c
src/gallium/drivers/radeonsi/si_build_pm4.h
src/gallium/drivers/radeonsi/si_clear.c
src/gallium/drivers/radeonsi/si_compute.c
src/gallium/drivers/radeonsi/si_compute.h
src/gallium/drivers/radeonsi/si_compute_blit.c
src/gallium/drivers/radeonsi/si_compute_prim_discard.c
src/gallium/drivers/radeonsi/si_cp_dma.c
src/gallium/drivers/radeonsi/si_debug.c
src/gallium/drivers/radeonsi/si_debug_options.h
src/gallium/drivers/radeonsi/si_descriptors.c
src/gallium/drivers/radeonsi/si_dma_cs.c
src/gallium/drivers/radeonsi/si_fence.c
src/gallium/drivers/radeonsi/si_get.c
src/gallium/drivers/radeonsi/si_gfx_cs.c
src/gallium/drivers/radeonsi/si_gpu_load.c
src/gallium/drivers/radeonsi/si_perfcounter.c
src/gallium/drivers/radeonsi/si_pipe.c
src/gallium/drivers/radeonsi/si_pipe.h
src/gallium/drivers/radeonsi/si_pm4.c
src/gallium/drivers/radeonsi/si_pm4.h
src/gallium/drivers/radeonsi/si_query.c
src/gallium/drivers/radeonsi/si_query.h
src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_shader.h
src/gallium/drivers/radeonsi/si_shader_internal.h
src/gallium/drivers/radeonsi/si_shader_llvm.c
src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
src/gallium/drivers/radeonsi/si_shader_llvm_ps.c
src/gallium/drivers/radeonsi/si_shader_llvm_resources.c
src/gallium/drivers/radeonsi/si_shader_llvm_tess.c
src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
src/gallium/drivers/radeonsi/si_shader_nir.c
src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
src/gallium/drivers/radeonsi/si_state.c
src/gallium/drivers/radeonsi/si_state.h
src/gallium/drivers/radeonsi/si_state_binning.c
src/gallium/drivers/radeonsi/si_state_draw.c
src/gallium/drivers/radeonsi/si_state_msaa.c
src/gallium/drivers/radeonsi/si_state_shaders.c
src/gallium/drivers/radeonsi/si_state_streamout.c
src/gallium/drivers/radeonsi/si_state_viewport.c
src/gallium/drivers/radeonsi/si_test_dma.c
src/gallium/drivers/radeonsi/si_test_dma_perf.c
src/gallium/drivers/radeonsi/si_texture.c
src/gallium/drivers/radeonsi/si_uvd.c

diff --git a/src/gallium/drivers/radeonsi/.editorconfig b/src/gallium/drivers/radeonsi/.editorconfig
deleted file mode 100644 (file)
index 21a3c7d..0000000
--- a/src/gallium/drivers/radeonsi/.editorconfig
+++ /dev/null
@@ -1,3 +0,0 @@
-[*.{c,h}]
-indent_style = tab
-indent_size = tab
diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
index df8a2fcd577f9fac92e4ef7d4349645c4e70b501..74c289b01349bce4329d249bb6c03f49fe98cd86 100644 (file)
--- a/src/gallium/drivers/radeonsi/cik_sdma.c
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include "sid.h"
 #include "si_pipe.h"
+#include "sid.h"
 
 static unsigned minify_as_blocks(unsigned width, unsigned level, unsigned blk_w)
 {
-       width = u_minify(width, level);
-       return DIV_ROUND_UP(width, blk_w);
+   width = u_minify(width, level);
+   return DIV_ROUND_UP(width, blk_w);
 }
 
-static unsigned encode_tile_info(struct si_context *sctx,
-                                struct si_texture *tex, unsigned level,
-                                bool set_bpp)
+static unsigned encode_tile_info(struct si_context *sctx, struct si_texture *tex, unsigned level,
+                                 bool set_bpp)
 {
-       struct radeon_info *info = &sctx->screen->info;
-       unsigned tile_index = tex->surface.u.legacy.tiling_index[level];
-       unsigned macro_tile_index = tex->surface.u.legacy.macro_tile_index;
-       unsigned tile_mode = info->si_tile_mode_array[tile_index];
-       unsigned macro_tile_mode = info->cik_macrotile_mode_array[macro_tile_index];
-
-       return (set_bpp ? util_logbase2(tex->surface.bpe) : 0) |
-               (G_009910_ARRAY_MODE(tile_mode) << 3) |
-               (G_009910_MICRO_TILE_MODE_NEW(tile_mode) << 8) |
-               /* Non-depth modes don't have TILE_SPLIT set. */
-               ((util_logbase2(tex->surface.u.legacy.tile_split >> 6)) << 11) |
-               (G_009990_BANK_WIDTH(macro_tile_mode) << 15) |
-               (G_009990_BANK_HEIGHT(macro_tile_mode) << 18) |
-               (G_009990_NUM_BANKS(macro_tile_mode) << 21) |
-               (G_009990_MACRO_TILE_ASPECT(macro_tile_mode) << 24) |
-               (G_009910_PIPE_CONFIG(tile_mode) << 26);
+   struct radeon_info *info = &sctx->screen->info;
+   unsigned tile_index = tex->surface.u.legacy.tiling_index[level];
+   unsigned macro_tile_index = tex->surface.u.legacy.macro_tile_index;
+   unsigned tile_mode = info->si_tile_mode_array[tile_index];
+   unsigned macro_tile_mode = info->cik_macrotile_mode_array[macro_tile_index];
+
+   return (set_bpp ? util_logbase2(tex->surface.bpe) : 0) | (G_009910_ARRAY_MODE(tile_mode) << 3) |
+          (G_009910_MICRO_TILE_MODE_NEW(tile_mode) << 8) |
+          /* Non-depth modes don't have TILE_SPLIT set. */
+          ((util_logbase2(tex->surface.u.legacy.tile_split >> 6)) << 11) |
+          (G_009990_BANK_WIDTH(macro_tile_mode) << 15) |
+          (G_009990_BANK_HEIGHT(macro_tile_mode) << 18) |
+          (G_009990_NUM_BANKS(macro_tile_mode) << 21) |
+          (G_009990_MACRO_TILE_ASPECT(macro_tile_mode) << 24) |
+          (G_009910_PIPE_CONFIG(tile_mode) << 26);
 }
 
-
-static bool si_sdma_v4_copy_texture(struct si_context *sctx,
-                                 struct pipe_resource *dst,
-                                 unsigned dst_level,
-                                 unsigned dstx, unsigned dsty, unsigned dstz,
-                                 struct pipe_resource *src,
-                                 unsigned src_level,
-                                 const struct pipe_box *src_box)
+static bool si_sdma_v4_copy_texture(struct si_context *sctx, struct pipe_resource *dst,
+                                    unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz,
+                                    struct pipe_resource *src, unsigned src_level,
+                                    const struct pipe_box *src_box)
 {
-       struct si_texture *ssrc = (struct si_texture*)src;
-       struct si_texture *sdst = (struct si_texture*)dst;
-
-       unsigned bpp = sdst->surface.bpe;
-       uint64_t dst_address = sdst->buffer.gpu_address +
-               sdst->surface.u.gfx9.surf_offset;
-       uint64_t src_address = ssrc->buffer.gpu_address +
-               ssrc->surface.u.gfx9.surf_offset;
-       unsigned dst_pitch = sdst->surface.u.gfx9.surf_pitch;
-       unsigned src_pitch = ssrc->surface.u.gfx9.surf_pitch;
-       uint64_t dst_slice_pitch = ((uint64_t)sdst->surface.u.gfx9.surf_slice_size) / bpp;
-       uint64_t src_slice_pitch = ((uint64_t)ssrc->surface.u.gfx9.surf_slice_size) / bpp;
-       unsigned srcx = src_box->x / ssrc->surface.blk_w;
-       unsigned srcy = src_box->y / ssrc->surface.blk_h;
-       unsigned srcz = src_box->z;
-       unsigned copy_width = DIV_ROUND_UP(src_box->width, ssrc->surface.blk_w);
-       unsigned copy_height = DIV_ROUND_UP(src_box->height, ssrc->surface.blk_h);
-       unsigned copy_depth = src_box->depth;
-       unsigned xalign = MAX2(1, 4 / bpp);
-
-       assert(src_level <= src->last_level);
-       assert(dst_level <= dst->last_level);
-       assert(sdst->surface.u.gfx9.surf_offset +
-              dst_slice_pitch * bpp * (dstz + src_box->depth) <=
-              sdst->buffer.buf->size);
-       assert(ssrc->surface.u.gfx9.surf_offset +
-              src_slice_pitch * bpp * (srcz + src_box->depth) <=
-              ssrc->buffer.buf->size);
-
-       if (!si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty,
-                                    dstz, ssrc, src_level, src_box))
-               return false;
-
-       dstx /= sdst->surface.blk_w;
-       dsty /= sdst->surface.blk_h;
-
-       if (srcx >= (1 << 14) ||
-           srcy >= (1 << 14) ||
-           srcz >= (1 << 11) ||
-           dstx >= (1 << 14) ||
-           dsty >= (1 << 14) ||
-           dstz >= (1 << 11))
-               return false;
-
-       /* Linear -> linear sub-window copy. */
-       if (ssrc->surface.is_linear &&
-           sdst->surface.is_linear) {
-               struct radeon_cmdbuf *cs = sctx->sdma_cs;
-
-               /* Check if everything fits into the bitfields */
-               if (!(src_pitch <= (1 << 19) &&
-                     dst_pitch <= (1 << 19) &&
-                     src_slice_pitch <= (1 << 28) &&
-                     dst_slice_pitch <= (1 << 28) &&
-                     copy_width <= (1 << 14) &&
-                     copy_height <= (1 << 14) &&
-                     copy_depth <= (1 << 11)))
-                       return false;
-
-               si_need_dma_space(sctx, 13, &sdst->buffer, &ssrc->buffer);
-
-               src_address += ssrc->surface.u.gfx9.offset[src_level];
-               dst_address += sdst->surface.u.gfx9.offset[dst_level];
-
-               /* Check alignments */
-               if ((src_address % 4) != 0 ||
-                   (dst_address % 4) != 0 ||
-                   (src_pitch % xalign) != 0)
-                       return false;
-
-               radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
-                                               CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) |
-                           (util_logbase2(bpp) << 29));
-               radeon_emit(cs, src_address);
-               radeon_emit(cs, src_address >> 32);
-               radeon_emit(cs, srcx | (srcy << 16));
-               radeon_emit(cs, srcz | ((src_pitch - 1) << 13));
-               radeon_emit(cs, src_slice_pitch - 1);
-               radeon_emit(cs, dst_address);
-               radeon_emit(cs, dst_address >> 32);
-               radeon_emit(cs, dstx | (dsty << 16));
-               radeon_emit(cs, dstz | ((dst_pitch - 1) << 13));
-               radeon_emit(cs, dst_slice_pitch - 1);
-               radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16));
-               radeon_emit(cs, (copy_depth - 1));
-               return true;
-       }
-
-       /* Linear <-> Tiled sub-window copy */
-       if (ssrc->surface.is_linear != sdst->surface.is_linear) {
-               struct si_texture *tiled = ssrc->surface.is_linear ? sdst : ssrc;
-               struct si_texture *linear = tiled == ssrc ? sdst : ssrc;
-               unsigned tiled_level =  tiled   == ssrc ? src_level : dst_level;
-               unsigned linear_level = linear  == ssrc ? src_level : dst_level;
-               unsigned tiled_x =      tiled   == ssrc ? srcx : dstx;
-               unsigned linear_x =     linear  == ssrc ? srcx : dstx;
-               unsigned tiled_y =      tiled   == ssrc ? srcy : dsty;
-               unsigned linear_y =     linear  == ssrc ? srcy : dsty;
-               unsigned tiled_z =      tiled   == ssrc ? srcz : dstz;
-               unsigned linear_z =     linear  == ssrc ? srcz : dstz;
-               unsigned tiled_width = tiled == ssrc ?
-                       DIV_ROUND_UP(ssrc->buffer.b.b.width0, ssrc->surface.blk_w) :
-                       DIV_ROUND_UP(sdst->buffer.b.b.width0, sdst->surface.blk_w);
-               unsigned tiled_height = tiled == ssrc ?
-                       DIV_ROUND_UP(ssrc->buffer.b.b.height0, ssrc->surface.blk_h) :
-                       DIV_ROUND_UP(sdst->buffer.b.b.height0, sdst->surface.blk_h);
-               unsigned tiled_depth =  tiled   == ssrc ?
-                       ssrc->buffer.b.b.depth0 :
-                       sdst->buffer.b.b.depth0;
-               unsigned linear_pitch = linear  == ssrc ? src_pitch : dst_pitch;
-               unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch;
-               uint64_t tiled_address =  tiled  == ssrc ? src_address : dst_address;
-               uint64_t linear_address = linear == ssrc ? src_address : dst_address;
-               struct radeon_cmdbuf *cs = sctx->sdma_cs;
-
-               linear_address += linear->surface.u.gfx9.offset[linear_level];
-
-               /* Check if everything fits into the bitfields */
-               if (!(tiled_x <= (1 << 14) &&
-                     tiled_y <= (1 << 14) &&
-                     tiled_z <= (1 << 11) &&
-                     tiled_width <= (1 << 14) &&
-                     tiled_height <= (1 << 14) &&
-                     tiled_depth <= (1 << 11) &&
-                     tiled->surface.u.gfx9.surf.epitch <= (1 << 16) &&
-                     linear_x <= (1 << 14) &&
-                     linear_y <= (1 << 14) &&
-                     linear_z <= (1 << 11) &&
-                     linear_pitch <= (1 << 14) &&
-                     linear_slice_pitch <= (1 << 28) &&
-                     copy_width <= (1 << 14) &&
-                     copy_height <= (1 << 14) &&
-                     copy_depth <= (1 << 11)))
-                       return false;
-
-               /* Check alignments */
-               if ((tiled_address % 256 != 0) ||
-                   (linear_address % 4 != 0) ||
-                   (linear_pitch % xalign != 0) ||
-                   (linear_slice_pitch % xalign != 0))
-                       return false;
-
-               si_need_dma_space(sctx, 14, &sdst->buffer, &ssrc->buffer);
-
-               radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
-                                               CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) |
-                                               tiled->buffer.b.b.last_level << 20 |
-                                               tiled_level << 24 |
-                                               (linear == sdst ? 1u : 0) << 31);
-               radeon_emit(cs, (uint32_t) tiled_address);
-               radeon_emit(cs, (uint32_t) (tiled_address >> 32));
-               radeon_emit(cs, tiled_x | (tiled_y << 16));
-               radeon_emit(cs, tiled_z | ((tiled_width - 1) << 16));
-               radeon_emit(cs, (tiled_height - 1) | (tiled_depth - 1) << 16);
-               radeon_emit(cs, util_logbase2(bpp) |
-                               tiled->surface.u.gfx9.surf.swizzle_mode << 3 |
-                               tiled->surface.u.gfx9.resource_type << 9 |
-                               tiled->surface.u.gfx9.surf.epitch << 16);
-               radeon_emit(cs, (uint32_t) linear_address);
-               radeon_emit(cs, (uint32_t) (linear_address >> 32));
-               radeon_emit(cs, linear_x | (linear_y << 16));
-               radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16));
-               radeon_emit(cs, linear_slice_pitch - 1);
-               radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16));
-               radeon_emit(cs, (copy_depth - 1));
-               return true;
-       }
-
-       return false;
+   struct si_texture *ssrc = (struct si_texture *)src;
+   struct si_texture *sdst = (struct si_texture *)dst;
+
+   unsigned bpp = sdst->surface.bpe;
+   uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.gfx9.surf_offset;
+   uint64_t src_address = ssrc->buffer.gpu_address + ssrc->surface.u.gfx9.surf_offset;
+   unsigned dst_pitch = sdst->surface.u.gfx9.surf_pitch;
+   unsigned src_pitch = ssrc->surface.u.gfx9.surf_pitch;
+   uint64_t dst_slice_pitch = ((uint64_t)sdst->surface.u.gfx9.surf_slice_size) / bpp;
+   uint64_t src_slice_pitch = ((uint64_t)ssrc->surface.u.gfx9.surf_slice_size) / bpp;
+   unsigned srcx = src_box->x / ssrc->surface.blk_w;
+   unsigned srcy = src_box->y / ssrc->surface.blk_h;
+   unsigned srcz = src_box->z;
+   unsigned copy_width = DIV_ROUND_UP(src_box->width, ssrc->surface.blk_w);
+   unsigned copy_height = DIV_ROUND_UP(src_box->height, ssrc->surface.blk_h);
+   unsigned copy_depth = src_box->depth;
+   unsigned xalign = MAX2(1, 4 / bpp);
+
+   assert(src_level <= src->last_level);
+   assert(dst_level <= dst->last_level);
+   assert(sdst->surface.u.gfx9.surf_offset + dst_slice_pitch * bpp * (dstz + src_box->depth) <=
+          sdst->buffer.buf->size);
+   assert(ssrc->surface.u.gfx9.surf_offset + src_slice_pitch * bpp * (srcz + src_box->depth) <=
+          ssrc->buffer.buf->size);
+
+   if (!si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty, dstz, ssrc, src_level, src_box))
+      return false;
+
+   dstx /= sdst->surface.blk_w;
+   dsty /= sdst->surface.blk_h;
+
+   if (srcx >= (1 << 14) || srcy >= (1 << 14) || srcz >= (1 << 11) || dstx >= (1 << 14) ||
+       dsty >= (1 << 14) || dstz >= (1 << 11))
+      return false;
+
+   /* Linear -> linear sub-window copy. */
+   if (ssrc->surface.is_linear && sdst->surface.is_linear) {
+      struct radeon_cmdbuf *cs = sctx->sdma_cs;
+
+      /* Check if everything fits into the bitfields */
+      if (!(src_pitch <= (1 << 19) && dst_pitch <= (1 << 19) && src_slice_pitch <= (1 << 28) &&
+            dst_slice_pitch <= (1 << 28) && copy_width <= (1 << 14) && copy_height <= (1 << 14) &&
+            copy_depth <= (1 << 11)))
+         return false;
+
+      si_need_dma_space(sctx, 13, &sdst->buffer, &ssrc->buffer);
+
+      src_address += ssrc->surface.u.gfx9.offset[src_level];
+      dst_address += sdst->surface.u.gfx9.offset[dst_level];
+
+      /* Check alignments */
+      if ((src_address % 4) != 0 || (dst_address % 4) != 0 || (src_pitch % xalign) != 0)
+         return false;
+
+      radeon_emit(
+         cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) |
+                (util_logbase2(bpp) << 29));
+      radeon_emit(cs, src_address);
+      radeon_emit(cs, src_address >> 32);
+      radeon_emit(cs, srcx | (srcy << 16));
+      radeon_emit(cs, srcz | ((src_pitch - 1) << 13));
+      radeon_emit(cs, src_slice_pitch - 1);
+      radeon_emit(cs, dst_address);
+      radeon_emit(cs, dst_address >> 32);
+      radeon_emit(cs, dstx | (dsty << 16));
+      radeon_emit(cs, dstz | ((dst_pitch - 1) << 13));
+      radeon_emit(cs, dst_slice_pitch - 1);
+      radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16));
+      radeon_emit(cs, (copy_depth - 1));
+      return true;
+   }
+
+   /* Linear <-> Tiled sub-window copy */
+   if (ssrc->surface.is_linear != sdst->surface.is_linear) {
+      struct si_texture *tiled = ssrc->surface.is_linear ? sdst : ssrc;
+      struct si_texture *linear = tiled == ssrc ? sdst : ssrc;
+      unsigned tiled_level = tiled == ssrc ? src_level : dst_level;
+      unsigned linear_level = linear == ssrc ? src_level : dst_level;
+      unsigned tiled_x = tiled == ssrc ? srcx : dstx;
+      unsigned linear_x = linear == ssrc ? srcx : dstx;
+      unsigned tiled_y = tiled == ssrc ? srcy : dsty;
+      unsigned linear_y = linear == ssrc ? srcy : dsty;
+      unsigned tiled_z = tiled == ssrc ? srcz : dstz;
+      unsigned linear_z = linear == ssrc ? srcz : dstz;
+      unsigned tiled_width = tiled == ssrc
+                                ? DIV_ROUND_UP(ssrc->buffer.b.b.width0, ssrc->surface.blk_w)
+                                : DIV_ROUND_UP(sdst->buffer.b.b.width0, sdst->surface.blk_w);
+      unsigned tiled_height = tiled == ssrc
+                                 ? DIV_ROUND_UP(ssrc->buffer.b.b.height0, ssrc->surface.blk_h)
+                                 : DIV_ROUND_UP(sdst->buffer.b.b.height0, sdst->surface.blk_h);
+      unsigned tiled_depth = tiled == ssrc ? ssrc->buffer.b.b.depth0 : sdst->buffer.b.b.depth0;
+      unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch;
+      unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch;
+      uint64_t tiled_address = tiled == ssrc ? src_address : dst_address;
+      uint64_t linear_address = linear == ssrc ? src_address : dst_address;
+      struct radeon_cmdbuf *cs = sctx->sdma_cs;
+
+      linear_address += linear->surface.u.gfx9.offset[linear_level];
+
+      /* Check if everything fits into the bitfields */
+      if (!(tiled_x <= (1 << 14) && tiled_y <= (1 << 14) && tiled_z <= (1 << 11) &&
+            tiled_width <= (1 << 14) && tiled_height <= (1 << 14) && tiled_depth <= (1 << 11) &&
+            tiled->surface.u.gfx9.surf.epitch <= (1 << 16) && linear_x <= (1 << 14) &&
+            linear_y <= (1 << 14) && linear_z <= (1 << 11) && linear_pitch <= (1 << 14) &&
+            linear_slice_pitch <= (1 << 28) && copy_width <= (1 << 14) &&
+            copy_height <= (1 << 14) && copy_depth <= (1 << 11)))
+         return false;
+
+      /* Check alignments */
+      if ((tiled_address % 256 != 0) || (linear_address % 4 != 0) || (linear_pitch % xalign != 0) ||
+          (linear_slice_pitch % xalign != 0))
+         return false;
+
+      si_need_dma_space(sctx, 14, &sdst->buffer, &ssrc->buffer);
+
+      radeon_emit(
+         cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) |
+                tiled->buffer.b.b.last_level << 20 | tiled_level << 24 |
+                (linear == sdst ? 1u : 0) << 31);
+      radeon_emit(cs, (uint32_t)tiled_address);
+      radeon_emit(cs, (uint32_t)(tiled_address >> 32));
+      radeon_emit(cs, tiled_x | (tiled_y << 16));
+      radeon_emit(cs, tiled_z | ((tiled_width - 1) << 16));
+      radeon_emit(cs, (tiled_height - 1) | (tiled_depth - 1) << 16);
+      radeon_emit(cs, util_logbase2(bpp) | tiled->surface.u.gfx9.surf.swizzle_mode << 3 |
+                         tiled->surface.u.gfx9.resource_type << 9 |
+                         tiled->surface.u.gfx9.surf.epitch << 16);
+      radeon_emit(cs, (uint32_t)linear_address);
+      radeon_emit(cs, (uint32_t)(linear_address >> 32));
+      radeon_emit(cs, linear_x | (linear_y << 16));
+      radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16));
+      radeon_emit(cs, linear_slice_pitch - 1);
+      radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16));
+      radeon_emit(cs, (copy_depth - 1));
+      return true;
+   }
+
+   return false;
 }
 
-static bool cik_sdma_copy_texture(struct si_context *sctx,
-                                 struct pipe_resource *dst,
-                                 unsigned dst_level,
-                                 unsigned dstx, unsigned dsty, unsigned dstz,
-                                 struct pipe_resource *src,
-                                 unsigned src_level,
-                                 const struct pipe_box *src_box)
+static bool cik_sdma_copy_texture(struct si_context *sctx, struct pipe_resource *dst,
+                                  unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz,
+                                  struct pipe_resource *src, unsigned src_level,
+                                  const struct pipe_box *src_box)
 {
-       struct radeon_info *info = &sctx->screen->info;
-       struct si_texture *ssrc = (struct si_texture*)src;
-       struct si_texture *sdst = (struct si_texture*)dst;
-       unsigned bpp = sdst->surface.bpe;
-       uint64_t dst_address = sdst->buffer.gpu_address +
-                              sdst->surface.u.legacy.level[dst_level].offset;
-       uint64_t src_address = ssrc->buffer.gpu_address +
-                              ssrc->surface.u.legacy.level[src_level].offset;
-       unsigned dst_mode = sdst->surface.u.legacy.level[dst_level].mode;
-       unsigned src_mode = ssrc->surface.u.legacy.level[src_level].mode;
-       unsigned dst_tile_index = sdst->surface.u.legacy.tiling_index[dst_level];
-       unsigned src_tile_index = ssrc->surface.u.legacy.tiling_index[src_level];
-       unsigned dst_tile_mode = info->si_tile_mode_array[dst_tile_index];
-       unsigned src_tile_mode = info->si_tile_mode_array[src_tile_index];
-       unsigned dst_micro_mode = G_009910_MICRO_TILE_MODE_NEW(dst_tile_mode);
-       unsigned src_micro_mode = G_009910_MICRO_TILE_MODE_NEW(src_tile_mode);
-       unsigned dst_tile_swizzle = dst_mode == RADEON_SURF_MODE_2D ?
-                                           sdst->surface.tile_swizzle : 0;
-       unsigned src_tile_swizzle = src_mode == RADEON_SURF_MODE_2D ?
-                                           ssrc->surface.tile_swizzle : 0;
-       unsigned dst_pitch = sdst->surface.u.legacy.level[dst_level].nblk_x;
-       unsigned src_pitch = ssrc->surface.u.legacy.level[src_level].nblk_x;
-       uint64_t dst_slice_pitch = ((uint64_t)sdst->surface.u.legacy.level[dst_level].slice_size_dw * 4) / bpp;
-       uint64_t src_slice_pitch = ((uint64_t)ssrc->surface.u.legacy.level[src_level].slice_size_dw * 4) / bpp;
-       unsigned dst_width = minify_as_blocks(sdst->buffer.b.b.width0,
-                                             dst_level, sdst->surface.blk_w);
-       unsigned src_width = minify_as_blocks(ssrc->buffer.b.b.width0,
-                                             src_level, ssrc->surface.blk_w);
-       unsigned dst_height = minify_as_blocks(sdst->buffer.b.b.height0,
-                                              dst_level, sdst->surface.blk_h);
-       unsigned src_height = minify_as_blocks(ssrc->buffer.b.b.height0,
-                                              src_level, ssrc->surface.blk_h);
-       unsigned srcx = src_box->x / ssrc->surface.blk_w;
-       unsigned srcy = src_box->y / ssrc->surface.blk_h;
-       unsigned srcz = src_box->z;
-       unsigned copy_width = DIV_ROUND_UP(src_box->width, ssrc->surface.blk_w);
-       unsigned copy_height = DIV_ROUND_UP(src_box->height, ssrc->surface.blk_h);
-       unsigned copy_depth = src_box->depth;
-
-       assert(src_level <= src->last_level);
-       assert(dst_level <= dst->last_level);
-       assert(sdst->surface.u.legacy.level[dst_level].offset +
-              dst_slice_pitch * bpp * (dstz + src_box->depth) <=
-              sdst->buffer.buf->size);
-       assert(ssrc->surface.u.legacy.level[src_level].offset +
-              src_slice_pitch * bpp * (srcz + src_box->depth) <=
-              ssrc->buffer.buf->size);
-
-       if (!si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty,
-                                    dstz, ssrc, src_level, src_box))
-               return false;
-
-       dstx /= sdst->surface.blk_w;
-       dsty /= sdst->surface.blk_h;
-
-       if (srcx >= (1 << 14) ||
-           srcy >= (1 << 14) ||
-           srcz >= (1 << 11) ||
-           dstx >= (1 << 14) ||
-           dsty >= (1 << 14) ||
-           dstz >= (1 << 11))
-               return false;
-
-       dst_address |= dst_tile_swizzle << 8;
-       src_address |= src_tile_swizzle << 8;
-
-       /* Linear -> linear sub-window copy. */
-       if (dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED &&
-           src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED &&
-           /* check if everything fits into the bitfields */
-           src_pitch <= (1 << 14) &&
-           dst_pitch <= (1 << 14) &&
-           src_slice_pitch <= (1 << 28) &&
-           dst_slice_pitch <= (1 << 28) &&
-           copy_width <= (1 << 14) &&
-           copy_height <= (1 << 14) &&
-           copy_depth <= (1 << 11) &&
-           /* HW limitation - GFX7: */
-           (sctx->chip_class != GFX7 ||
-            (copy_width < (1 << 14) &&
-             copy_height < (1 << 14) &&
-             copy_depth < (1 << 11))) &&
-           /* HW limitation - some GFX7 parts: */
-           ((sctx->family != CHIP_BONAIRE &&
-             sctx->family != CHIP_KAVERI) ||
-            (srcx + copy_width != (1 << 14) &&
-             srcy + copy_height != (1 << 14)))) {
-               struct radeon_cmdbuf *cs = sctx->sdma_cs;
-
-               si_need_dma_space(sctx, 13, &sdst->buffer, &ssrc->buffer);
-
-               radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
-                                               CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) |
-                           (util_logbase2(bpp) << 29));
-               radeon_emit(cs, src_address);
-               radeon_emit(cs, src_address >> 32);
-               radeon_emit(cs, srcx | (srcy << 16));
-               radeon_emit(cs, srcz | ((src_pitch - 1) << 16));
-               radeon_emit(cs, src_slice_pitch - 1);
-               radeon_emit(cs, dst_address);
-               radeon_emit(cs, dst_address >> 32);
-               radeon_emit(cs, dstx | (dsty << 16));
-               radeon_emit(cs, dstz | ((dst_pitch - 1) << 16));
-               radeon_emit(cs, dst_slice_pitch - 1);
-               if (sctx->chip_class == GFX7) {
-                       radeon_emit(cs, copy_width | (copy_height << 16));
-                       radeon_emit(cs, copy_depth);
-               } else {
-                       radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16));
-                       radeon_emit(cs, (copy_depth - 1));
-               }
-               return true;
-       }
-
-       /* Tiled <-> linear sub-window copy. */
-       if ((src_mode >= RADEON_SURF_MODE_1D) != (dst_mode >= RADEON_SURF_MODE_1D)) {
-               struct si_texture *tiled = src_mode >= RADEON_SURF_MODE_1D ? ssrc : sdst;
-               struct si_texture *linear = tiled == ssrc ? sdst : ssrc;
-               unsigned tiled_level =  tiled   == ssrc ? src_level : dst_level;
-               unsigned linear_level = linear  == ssrc ? src_level : dst_level;
-               unsigned tiled_x =      tiled   == ssrc ? srcx : dstx;
-               unsigned linear_x =     linear  == ssrc ? srcx : dstx;
-               unsigned tiled_y =      tiled   == ssrc ? srcy : dsty;
-               unsigned linear_y =     linear  == ssrc ? srcy : dsty;
-               unsigned tiled_z =      tiled   == ssrc ? srcz : dstz;
-               unsigned linear_z =     linear  == ssrc ? srcz : dstz;
-               unsigned tiled_width =  tiled   == ssrc ? src_width : dst_width;
-               unsigned linear_width = linear  == ssrc ? src_width : dst_width;
-               unsigned tiled_pitch =  tiled   == ssrc ? src_pitch : dst_pitch;
-               unsigned linear_pitch = linear  == ssrc ? src_pitch : dst_pitch;
-               unsigned tiled_slice_pitch  = tiled  == ssrc ? src_slice_pitch : dst_slice_pitch;
-               unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch;
-               uint64_t tiled_address =  tiled  == ssrc ? src_address : dst_address;
-               uint64_t linear_address = linear == ssrc ? src_address : dst_address;
-               unsigned tiled_micro_mode = tiled == ssrc ? src_micro_mode : dst_micro_mode;
-
-               assert(tiled_pitch % 8 == 0);
-               assert(tiled_slice_pitch % 64 == 0);
-               unsigned pitch_tile_max = tiled_pitch / 8 - 1;
-               unsigned slice_tile_max = tiled_slice_pitch / 64 - 1;
-               unsigned xalign = MAX2(1, 4 / bpp);
-               unsigned copy_width_aligned = copy_width;
-
-               /* If the region ends at the last pixel and is unaligned, we
-                * can copy the remainder of the line that is not visible to
-                * make it aligned.
-                */
-               if (copy_width % xalign != 0 &&
-                   linear_x + copy_width == linear_width &&
-                   tiled_x  + copy_width == tiled_width &&
-                   linear_x + align(copy_width, xalign) <= linear_pitch &&
-                   tiled_x  + align(copy_width, xalign) <= tiled_pitch)
-                       copy_width_aligned = align(copy_width, xalign);
-
-               /* HW limitations. */
-               if ((sctx->family == CHIP_BONAIRE ||
-                    sctx->family == CHIP_KAVERI) &&
-                   linear_pitch - 1 == 0x3fff &&
-                   bpp == 16)
-                       return false;
-
-               if (sctx->chip_class == GFX7 &&
-                   (copy_width_aligned == (1 << 14) ||
-                    copy_height == (1 << 14) ||
-                    copy_depth == (1 << 11)))
-                       return false;
-
-               if ((sctx->family == CHIP_BONAIRE ||
-                    sctx->family == CHIP_KAVERI ||
-                    sctx->family == CHIP_KABINI) &&
-                   (tiled_x + copy_width == (1 << 14) ||
-                    tiled_y + copy_height == (1 << 14)))
-                       return false;
-
-               /* The hw can read outside of the given linear buffer bounds,
-                * or access those pages but not touch the memory in case
-                * of writes. (it still causes a VM fault)
-                *
-                * Out-of-bounds memory access or page directory access must
-                * be prevented.
-                */
-               int64_t start_linear_address, end_linear_address;
-               unsigned granularity;
-
-               /* Deduce the size of reads from the linear surface. */
-               switch (tiled_micro_mode) {
-               case V_009910_ADDR_SURF_DISPLAY_MICRO_TILING:
-                       granularity = bpp == 1 ? 64 / (8*bpp) :
-                                                128 / (8*bpp);
-                       break;
-               case V_009910_ADDR_SURF_THIN_MICRO_TILING:
-               case V_009910_ADDR_SURF_DEPTH_MICRO_TILING:
-                       if (0 /* TODO: THICK microtiling */)
-                               granularity = bpp == 1 ? 32 / (8*bpp) :
-                                             bpp == 2 ? 64 / (8*bpp) :
-                                             bpp <= 8 ? 128 / (8*bpp) :
-                                                        256 / (8*bpp);
-                       else
-                               granularity = bpp <= 2 ? 64 / (8*bpp) :
-                                             bpp <= 8 ? 128 / (8*bpp) :
-                                                        256 / (8*bpp);
-                       break;
-               default:
-                       return false;
-               }
-
-               /* The linear reads start at tiled_x & ~(granularity - 1).
-                * If linear_x == 0 && tiled_x % granularity != 0, the hw
-                * starts reading from an address preceding linear_address!!!
-                */
-               start_linear_address =
-                       linear->surface.u.legacy.level[linear_level].offset +
-                       bpp * (linear_z * linear_slice_pitch +
-                              linear_y * linear_pitch +
-                              linear_x);
-               start_linear_address -= (int)(bpp * (tiled_x % granularity));
-
-               end_linear_address =
-                       linear->surface.u.legacy.level[linear_level].offset +
-                       bpp * ((linear_z + copy_depth - 1) * linear_slice_pitch +
-                              (linear_y + copy_height - 1) * linear_pitch +
-                              (linear_x + copy_width));
-
-               if ((tiled_x + copy_width) % granularity)
-                       end_linear_address += granularity -
-                                             (tiled_x + copy_width) % granularity;
-
-               if (start_linear_address < 0 ||
-                   end_linear_address > linear->surface.surf_size)
-                       return false;
-
-               /* Check requirements. */
-               if (tiled_address % 256 == 0 &&
-                   linear_address % 4 == 0 &&
-                   linear_pitch % xalign == 0 &&
-                   linear_x % xalign == 0 &&
-                   tiled_x % xalign == 0 &&
-                   copy_width_aligned % xalign == 0 &&
-                   tiled_micro_mode != V_009910_ADDR_SURF_ROTATED_MICRO_TILING &&
-                   /* check if everything fits into the bitfields */
-                   tiled->surface.u.legacy.tile_split <= 4096 &&
-                   pitch_tile_max < (1 << 11) &&
-                   slice_tile_max < (1 << 22) &&
-                   linear_pitch <= (1 << 14) &&
-                   linear_slice_pitch <= (1 << 28) &&
-                   copy_width_aligned <= (1 << 14) &&
-                   copy_height <= (1 << 14) &&
-                   copy_depth <= (1 << 11)) {
-                       struct radeon_cmdbuf *cs = sctx->sdma_cs;
-                       uint32_t direction = linear == sdst ? 1u << 31 : 0;
-
-                       si_need_dma_space(sctx, 14, &sdst->buffer, &ssrc->buffer);
-
-                       radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
-                                                       CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) |
-                                       direction);
-                       radeon_emit(cs, tiled_address);
-                       radeon_emit(cs, tiled_address >> 32);
-                       radeon_emit(cs, tiled_x | (tiled_y << 16));
-                       radeon_emit(cs, tiled_z | (pitch_tile_max << 16));
-                       radeon_emit(cs, slice_tile_max);
-                       radeon_emit(cs, encode_tile_info(sctx, tiled, tiled_level, true));
-                       radeon_emit(cs, linear_address);
-                       radeon_emit(cs, linear_address >> 32);
-                       radeon_emit(cs, linear_x | (linear_y << 16));
-                       radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16));
-                       radeon_emit(cs, linear_slice_pitch - 1);
-                       if (sctx->chip_class == GFX7) {
-                               radeon_emit(cs, copy_width_aligned | (copy_height << 16));
-                               radeon_emit(cs, copy_depth);
-                       } else {
-                               radeon_emit(cs, (copy_width_aligned - 1) | ((copy_height - 1) << 16));
-                               radeon_emit(cs, (copy_depth - 1));
-                       }
-                       return true;
-               }
-       }
-
-       /* Tiled -> Tiled sub-window copy. */
-       if (dst_mode >= RADEON_SURF_MODE_1D &&
-           src_mode >= RADEON_SURF_MODE_1D &&
-           /* check if these fit into the bitfields */
-           src_address % 256 == 0 &&
-           dst_address % 256 == 0 &&
-           ssrc->surface.u.legacy.tile_split <= 4096 &&
-           sdst->surface.u.legacy.tile_split <= 4096 &&
-           dstx % 8 == 0 &&
-           dsty % 8 == 0 &&
-           srcx % 8 == 0 &&
-           srcy % 8 == 0 &&
-           /* this can either be equal, or display->rotated (GFX8+ only) */
-           (src_micro_mode == dst_micro_mode ||
-            (sctx->chip_class >= GFX8 &&
-             src_micro_mode == V_009910_ADDR_SURF_DISPLAY_MICRO_TILING &&
-             dst_micro_mode == V_009910_ADDR_SURF_ROTATED_MICRO_TILING))) {
-               assert(src_pitch % 8 == 0);
-               assert(dst_pitch % 8 == 0);
-               assert(src_slice_pitch % 64 == 0);
-               assert(dst_slice_pitch % 64 == 0);
-               unsigned src_pitch_tile_max = src_pitch / 8 - 1;
-               unsigned dst_pitch_tile_max = dst_pitch / 8 - 1;
-               unsigned src_slice_tile_max = src_slice_pitch / 64 - 1;
-               unsigned dst_slice_tile_max = dst_slice_pitch / 64 - 1;
-               unsigned copy_width_aligned = copy_width;
-               unsigned copy_height_aligned = copy_height;
-
-               /* If the region ends at the last pixel and is unaligned, we
-                * can copy the remainder of the tile that is not visible to
-                * make it aligned.
-                */
-               if (copy_width % 8 != 0 &&
-                   srcx + copy_width == src_width &&
-                   dstx + copy_width == dst_width)
-                       copy_width_aligned = align(copy_width, 8);
-
-               if (copy_height % 8 != 0 &&
-                   srcy + copy_height == src_height &&
-                   dsty + copy_height == dst_height)
-                       copy_height_aligned = align(copy_height, 8);
-
-               /* check if these fit into the bitfields */
-               if (src_pitch_tile_max < (1 << 11) &&
-                   dst_pitch_tile_max < (1 << 11) &&
-                   src_slice_tile_max < (1 << 22) &&
-                   dst_slice_tile_max < (1 << 22) &&
-                   copy_width_aligned <= (1 << 14) &&
-                   copy_height_aligned <= (1 << 14) &&
-                   copy_depth <= (1 << 11) &&
-                   copy_width_aligned % 8 == 0 &&
-                   copy_height_aligned % 8 == 0 &&
-                   /* HW limitation - GFX7: */
-                   (sctx->chip_class != GFX7 ||
-                    (copy_width_aligned < (1 << 14) &&
-                     copy_height_aligned < (1 << 14) &&
-                     copy_depth < (1 << 11))) &&
-                   /* HW limitation - some GFX7 parts: */
-                   ((sctx->family != CHIP_BONAIRE &&
-                     sctx->family != CHIP_KAVERI &&
-                     sctx->family != CHIP_KABINI) ||
-                    (srcx + copy_width_aligned != (1 << 14) &&
-                     srcy + copy_height_aligned != (1 << 14) &&
-                     dstx + copy_width != (1 << 14)))) {
-                       struct radeon_cmdbuf *cs = sctx->sdma_cs;
-
-                       si_need_dma_space(sctx, 15, &sdst->buffer, &ssrc->buffer);
-
-                       radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
-                                                       CIK_SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW, 0));
-                       radeon_emit(cs, src_address);
-                       radeon_emit(cs, src_address >> 32);
-                       radeon_emit(cs, srcx | (srcy << 16));
-                       radeon_emit(cs, srcz | (src_pitch_tile_max << 16));
-                       radeon_emit(cs, src_slice_tile_max);
-                       radeon_emit(cs, encode_tile_info(sctx, ssrc, src_level, true));
-                       radeon_emit(cs, dst_address);
-                       radeon_emit(cs, dst_address >> 32);
-                       radeon_emit(cs, dstx | (dsty << 16));
-                       radeon_emit(cs, dstz | (dst_pitch_tile_max << 16));
-                       radeon_emit(cs, dst_slice_tile_max);
-                       radeon_emit(cs, encode_tile_info(sctx, sdst, dst_level, false));
-                       if (sctx->chip_class == GFX7) {
-                               radeon_emit(cs, copy_width_aligned |
-                                               (copy_height_aligned << 16));
-                               radeon_emit(cs, copy_depth);
-                       } else {
-                               radeon_emit(cs, (copy_width_aligned - 8) |
-                                               ((copy_height_aligned - 8) << 16));
-                               radeon_emit(cs, (copy_depth - 1));
-                       }
-                       return true;
-               }
-       }
-
-       return false;
+   struct radeon_info *info = &sctx->screen->info;
+   struct si_texture *ssrc = (struct si_texture *)src;
+   struct si_texture *sdst = (struct si_texture *)dst;
+   unsigned bpp = sdst->surface.bpe;
+   uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.legacy.level[dst_level].offset;
+   uint64_t src_address = ssrc->buffer.gpu_address + ssrc->surface.u.legacy.level[src_level].offset;
+   unsigned dst_mode = sdst->surface.u.legacy.level[dst_level].mode;
+   unsigned src_mode = ssrc->surface.u.legacy.level[src_level].mode;
+   unsigned dst_tile_index = sdst->surface.u.legacy.tiling_index[dst_level];
+   unsigned src_tile_index = ssrc->surface.u.legacy.tiling_index[src_level];
+   unsigned dst_tile_mode = info->si_tile_mode_array[dst_tile_index];
+   unsigned src_tile_mode = info->si_tile_mode_array[src_tile_index];
+   unsigned dst_micro_mode = G_009910_MICRO_TILE_MODE_NEW(dst_tile_mode);
+   unsigned src_micro_mode = G_009910_MICRO_TILE_MODE_NEW(src_tile_mode);
+   unsigned dst_tile_swizzle = dst_mode == RADEON_SURF_MODE_2D ? sdst->surface.tile_swizzle : 0;
+   unsigned src_tile_swizzle = src_mode == RADEON_SURF_MODE_2D ? ssrc->surface.tile_swizzle : 0;
+   unsigned dst_pitch = sdst->surface.u.legacy.level[dst_level].nblk_x;
+   unsigned src_pitch = ssrc->surface.u.legacy.level[src_level].nblk_x;
+   uint64_t dst_slice_pitch =
+      ((uint64_t)sdst->surface.u.legacy.level[dst_level].slice_size_dw * 4) / bpp;
+   uint64_t src_slice_pitch =
+      ((uint64_t)ssrc->surface.u.legacy.level[src_level].slice_size_dw * 4) / bpp;
+   unsigned dst_width = minify_as_blocks(sdst->buffer.b.b.width0, dst_level, sdst->surface.blk_w);
+   unsigned src_width = minify_as_blocks(ssrc->buffer.b.b.width0, src_level, ssrc->surface.blk_w);
+   unsigned dst_height = minify_as_blocks(sdst->buffer.b.b.height0, dst_level, sdst->surface.blk_h);
+   unsigned src_height = minify_as_blocks(ssrc->buffer.b.b.height0, src_level, ssrc->surface.blk_h);
+   unsigned srcx = src_box->x / ssrc->surface.blk_w;
+   unsigned srcy = src_box->y / ssrc->surface.blk_h;
+   unsigned srcz = src_box->z;
+   unsigned copy_width = DIV_ROUND_UP(src_box->width, ssrc->surface.blk_w);
+   unsigned copy_height = DIV_ROUND_UP(src_box->height, ssrc->surface.blk_h);
+   unsigned copy_depth = src_box->depth;
+
+   assert(src_level <= src->last_level);
+   assert(dst_level <= dst->last_level);
+   assert(sdst->surface.u.legacy.level[dst_level].offset +
+             dst_slice_pitch * bpp * (dstz + src_box->depth) <=
+          sdst->buffer.buf->size);
+   assert(ssrc->surface.u.legacy.level[src_level].offset +
+             src_slice_pitch * bpp * (srcz + src_box->depth) <=
+          ssrc->buffer.buf->size);
+
+   if (!si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty, dstz, ssrc, src_level, src_box))
+      return false;
+
+   dstx /= sdst->surface.blk_w;
+   dsty /= sdst->surface.blk_h;
+
+   if (srcx >= (1 << 14) || srcy >= (1 << 14) || srcz >= (1 << 11) || dstx >= (1 << 14) ||
+       dsty >= (1 << 14) || dstz >= (1 << 11))
+      return false;
+
+   dst_address |= dst_tile_swizzle << 8;
+   src_address |= src_tile_swizzle << 8;
+
+   /* Linear -> linear sub-window copy. */
+   if (dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED && src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED &&
+       /* check if everything fits into the bitfields */
+       src_pitch <= (1 << 14) && dst_pitch <= (1 << 14) && src_slice_pitch <= (1 << 28) &&
+       dst_slice_pitch <= (1 << 28) && copy_width <= (1 << 14) && copy_height <= (1 << 14) &&
+       copy_depth <= (1 << 11) &&
+       /* HW limitation - GFX7: */
+       (sctx->chip_class != GFX7 ||
+        (copy_width < (1 << 14) && copy_height < (1 << 14) && copy_depth < (1 << 11))) &&
+       /* HW limitation - some GFX7 parts: */
+       ((sctx->family != CHIP_BONAIRE && sctx->family != CHIP_KAVERI) ||
+        (srcx + copy_width != (1 << 14) && srcy + copy_height != (1 << 14)))) {
+      struct radeon_cmdbuf *cs = sctx->sdma_cs;
+
+      si_need_dma_space(sctx, 13, &sdst->buffer, &ssrc->buffer);
+
+      radeon_emit(
+         cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) |
+                (util_logbase2(bpp) << 29));
+      radeon_emit(cs, src_address);
+      radeon_emit(cs, src_address >> 32);
+      radeon_emit(cs, srcx | (srcy << 16));
+      radeon_emit(cs, srcz | ((src_pitch - 1) << 16));
+      radeon_emit(cs, src_slice_pitch - 1);
+      radeon_emit(cs, dst_address);
+      radeon_emit(cs, dst_address >> 32);
+      radeon_emit(cs, dstx | (dsty << 16));
+      radeon_emit(cs, dstz | ((dst_pitch - 1) << 16));
+      radeon_emit(cs, dst_slice_pitch - 1);
+      if (sctx->chip_class == GFX7) {
+         radeon_emit(cs, copy_width | (copy_height << 16));
+         radeon_emit(cs, copy_depth);
+      } else {
+         radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16));
+         radeon_emit(cs, (copy_depth - 1));
+      }
+      return true;
+   }
+
+   /* Tiled <-> linear sub-window copy. */
+   if ((src_mode >= RADEON_SURF_MODE_1D) != (dst_mode >= RADEON_SURF_MODE_1D)) {
+      struct si_texture *tiled = src_mode >= RADEON_SURF_MODE_1D ? ssrc : sdst;
+      struct si_texture *linear = tiled == ssrc ? sdst : ssrc;
+      unsigned tiled_level = tiled == ssrc ? src_level : dst_level;
+      unsigned linear_level = linear == ssrc ? src_level : dst_level;
+      unsigned tiled_x = tiled == ssrc ? srcx : dstx;
+      unsigned linear_x = linear == ssrc ? srcx : dstx;
+      unsigned tiled_y = tiled == ssrc ? srcy : dsty;
+      unsigned linear_y = linear == ssrc ? srcy : dsty;
+      unsigned tiled_z = tiled == ssrc ? srcz : dstz;
+      unsigned linear_z = linear == ssrc ? srcz : dstz;
+      unsigned tiled_width = tiled == ssrc ? src_width : dst_width;
+      unsigned linear_width = linear == ssrc ? src_width : dst_width;
+      unsigned tiled_pitch = tiled == ssrc ? src_pitch : dst_pitch;
+      unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch;
+      unsigned tiled_slice_pitch = tiled == ssrc ? src_slice_pitch : dst_slice_pitch;
+      unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch;
+      uint64_t tiled_address = tiled == ssrc ? src_address : dst_address;
+      uint64_t linear_address = linear == ssrc ? src_address : dst_address;
+      unsigned tiled_micro_mode = tiled == ssrc ? src_micro_mode : dst_micro_mode;
+
+      assert(tiled_pitch % 8 == 0);
+      assert(tiled_slice_pitch % 64 == 0);
+      unsigned pitch_tile_max = tiled_pitch / 8 - 1;
+      unsigned slice_tile_max = tiled_slice_pitch / 64 - 1;
+      unsigned xalign = MAX2(1, 4 / bpp);
+      unsigned copy_width_aligned = copy_width;
+
+      /* If the region ends at the last pixel and is unaligned, we
+       * can copy the remainder of the line that is not visible to
+       * make it aligned.
+       */
+      if (copy_width % xalign != 0 && linear_x + copy_width == linear_width &&
+          tiled_x + copy_width == tiled_width &&
+          linear_x + align(copy_width, xalign) <= linear_pitch &&
+          tiled_x + align(copy_width, xalign) <= tiled_pitch)
+         copy_width_aligned = align(copy_width, xalign);
+
+      /* HW limitations. */
+      if ((sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KAVERI) &&
+          linear_pitch - 1 == 0x3fff && bpp == 16)
+         return false;
+
+      if (sctx->chip_class == GFX7 &&
+          (copy_width_aligned == (1 << 14) || copy_height == (1 << 14) || copy_depth == (1 << 11)))
+         return false;
+
+      if ((sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KAVERI ||
+           sctx->family == CHIP_KABINI) &&
+          (tiled_x + copy_width == (1 << 14) || tiled_y + copy_height == (1 << 14)))
+         return false;
+
+      /* The hw can read outside of the given linear buffer bounds,
+       * or access those pages but not touch the memory in case
+       * of writes. (it still causes a VM fault)
+       *
+       * Out-of-bounds memory access or page directory access must
+       * be prevented.
+       */
+      int64_t start_linear_address, end_linear_address;
+      unsigned granularity;
+
+      /* Deduce the size of reads from the linear surface. */
+      switch (tiled_micro_mode) {
+      case V_009910_ADDR_SURF_DISPLAY_MICRO_TILING:
+         granularity = bpp == 1 ? 64 / (8 * bpp) : 128 / (8 * bpp);
+         break;
+      case V_009910_ADDR_SURF_THIN_MICRO_TILING:
+      case V_009910_ADDR_SURF_DEPTH_MICRO_TILING:
+         if (0 /* TODO: THICK microtiling */)
+            granularity =
+               bpp == 1 ? 32 / (8 * bpp)
+                        : bpp == 2 ? 64 / (8 * bpp) : bpp <= 8 ? 128 / (8 * bpp) : 256 / (8 * bpp);
+         else
+            granularity = bpp <= 2 ? 64 / (8 * bpp) : bpp <= 8 ? 128 / (8 * bpp) : 256 / (8 * bpp);
+         break;
+      default:
+         return false;
+      }
+
+      /* The linear reads start at tiled_x & ~(granularity - 1).
+       * If linear_x == 0 && tiled_x % granularity != 0, the hw
+       * starts reading from an address preceding linear_address!!!
+       */
+      start_linear_address =
+         linear->surface.u.legacy.level[linear_level].offset +
+         bpp * (linear_z * linear_slice_pitch + linear_y * linear_pitch + linear_x);
+      start_linear_address -= (int)(bpp * (tiled_x % granularity));
+
+      end_linear_address =
+         linear->surface.u.legacy.level[linear_level].offset +
+         bpp * ((linear_z + copy_depth - 1) * linear_slice_pitch +
+                (linear_y + copy_height - 1) * linear_pitch + (linear_x + copy_width));
+
+      if ((tiled_x + copy_width) % granularity)
+         end_linear_address += granularity - (tiled_x + copy_width) % granularity;
+
+      if (start_linear_address < 0 || end_linear_address > linear->surface.surf_size)
+         return false;
+
+      /* Check requirements. */
+      if (tiled_address % 256 == 0 && linear_address % 4 == 0 && linear_pitch % xalign == 0 &&
+          linear_x % xalign == 0 && tiled_x % xalign == 0 && copy_width_aligned % xalign == 0 &&
+          tiled_micro_mode != V_009910_ADDR_SURF_ROTATED_MICRO_TILING &&
+          /* check if everything fits into the bitfields */
+          tiled->surface.u.legacy.tile_split <= 4096 && pitch_tile_max < (1 << 11) &&
+          slice_tile_max < (1 << 22) && linear_pitch <= (1 << 14) &&
+          linear_slice_pitch <= (1 << 28) && copy_width_aligned <= (1 << 14) &&
+          copy_height <= (1 << 14) && copy_depth <= (1 << 11)) {
+         struct radeon_cmdbuf *cs = sctx->sdma_cs;
+         uint32_t direction = linear == sdst ? 1u << 31 : 0;
+
+         si_need_dma_space(sctx, 14, &sdst->buffer, &ssrc->buffer);
+
+         radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
+                                         CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) |
+                            direction);
+         radeon_emit(cs, tiled_address);
+         radeon_emit(cs, tiled_address >> 32);
+         radeon_emit(cs, tiled_x | (tiled_y << 16));
+         radeon_emit(cs, tiled_z | (pitch_tile_max << 16));
+         radeon_emit(cs, slice_tile_max);
+         radeon_emit(cs, encode_tile_info(sctx, tiled, tiled_level, true));
+         radeon_emit(cs, linear_address);
+         radeon_emit(cs, linear_address >> 32);
+         radeon_emit(cs, linear_x | (linear_y << 16));
+         radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16));
+         radeon_emit(cs, linear_slice_pitch - 1);
+         if (sctx->chip_class == GFX7) {
+            radeon_emit(cs, copy_width_aligned | (copy_height << 16));
+            radeon_emit(cs, copy_depth);
+         } else {
+            radeon_emit(cs, (copy_width_aligned - 1) | ((copy_height - 1) << 16));
+            radeon_emit(cs, (copy_depth - 1));
+         }
+         return true;
+      }
+   }
+
+   /* Tiled -> Tiled sub-window copy. */
+   if (dst_mode >= RADEON_SURF_MODE_1D && src_mode >= RADEON_SURF_MODE_1D &&
+       /* check if these fit into the bitfields */
+       src_address % 256 == 0 && dst_address % 256 == 0 &&
+       ssrc->surface.u.legacy.tile_split <= 4096 && sdst->surface.u.legacy.tile_split <= 4096 &&
+       dstx % 8 == 0 && dsty % 8 == 0 && srcx % 8 == 0 && srcy % 8 == 0 &&
+       /* this can either be equal, or display->rotated (GFX8+ only) */
+       (src_micro_mode == dst_micro_mode ||
+        (sctx->chip_class >= GFX8 && src_micro_mode == V_009910_ADDR_SURF_DISPLAY_MICRO_TILING &&
+         dst_micro_mode == V_009910_ADDR_SURF_ROTATED_MICRO_TILING))) {
+      assert(src_pitch % 8 == 0);
+      assert(dst_pitch % 8 == 0);
+      assert(src_slice_pitch % 64 == 0);
+      assert(dst_slice_pitch % 64 == 0);
+      unsigned src_pitch_tile_max = src_pitch / 8 - 1;
+      unsigned dst_pitch_tile_max = dst_pitch / 8 - 1;
+      unsigned src_slice_tile_max = src_slice_pitch / 64 - 1;
+      unsigned dst_slice_tile_max = dst_slice_pitch / 64 - 1;
+      unsigned copy_width_aligned = copy_width;
+      unsigned copy_height_aligned = copy_height;
+
+      /* If the region ends at the last pixel and is unaligned, we
+       * can copy the remainder of the tile that is not visible to
+       * make it aligned.
+       */
+      if (copy_width % 8 != 0 && srcx + copy_width == src_width && dstx + copy_width == dst_width)
+         copy_width_aligned = align(copy_width, 8);
+
+      if (copy_height % 8 != 0 && srcy + copy_height == src_height &&
+          dsty + copy_height == dst_height)
+         copy_height_aligned = align(copy_height, 8);
+
+      /* check if these fit into the bitfields */
+      if (src_pitch_tile_max < (1 << 11) && dst_pitch_tile_max < (1 << 11) &&
+          src_slice_tile_max < (1 << 22) && dst_slice_tile_max < (1 << 22) &&
+          copy_width_aligned <= (1 << 14) && copy_height_aligned <= (1 << 14) &&
+          copy_depth <= (1 << 11) && copy_width_aligned % 8 == 0 && copy_height_aligned % 8 == 0 &&
+          /* HW limitation - GFX7: */
+          (sctx->chip_class != GFX7 ||
+           (copy_width_aligned < (1 << 14) && copy_height_aligned < (1 << 14) &&
+            copy_depth < (1 << 11))) &&
+          /* HW limitation - some GFX7 parts: */
+          ((sctx->family != CHIP_BONAIRE && sctx->family != CHIP_KAVERI &&
+            sctx->family != CHIP_KABINI) ||
+           (srcx + copy_width_aligned != (1 << 14) && srcy + copy_height_aligned != (1 << 14) &&
+            dstx + copy_width != (1 << 14)))) {
+         struct radeon_cmdbuf *cs = sctx->sdma_cs;
+
+         si_need_dma_space(sctx, 15, &sdst->buffer, &ssrc->buffer);
+
+         radeon_emit(
+            cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW, 0));
+         radeon_emit(cs, src_address);
+         radeon_emit(cs, src_address >> 32);
+         radeon_emit(cs, srcx | (srcy << 16));
+         radeon_emit(cs, srcz | (src_pitch_tile_max << 16));
+         radeon_emit(cs, src_slice_tile_max);
+         radeon_emit(cs, encode_tile_info(sctx, ssrc, src_level, true));
+         radeon_emit(cs, dst_address);
+         radeon_emit(cs, dst_address >> 32);
+         radeon_emit(cs, dstx | (dsty << 16));
+         radeon_emit(cs, dstz | (dst_pitch_tile_max << 16));
+         radeon_emit(cs, dst_slice_tile_max);
+         radeon_emit(cs, encode_tile_info(sctx, sdst, dst_level, false));
+         if (sctx->chip_class == GFX7) {
+            radeon_emit(cs, copy_width_aligned | (copy_height_aligned << 16));
+            radeon_emit(cs, copy_depth);
+         } else {
+            radeon_emit(cs, (copy_width_aligned - 8) | ((copy_height_aligned - 8) << 16));
+            radeon_emit(cs, (copy_depth - 1));
+         }
+         return true;
+      }
+   }
+
+   return false;
 }
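As a quick sanity check on the read-granularity expressions in the linear <-> tiled path of cik_sdma_copy_texture above, the generic (non chip-specific) branch works out to a small table of element counts. A standalone sketch, plain C and illustration only, not driver code:

#include <stdio.h>

/* Read granularity in elements for the generic branch above:
 * 64 bits of data for bpp <= 2, 128 bits for bpp <= 8, 256 bits otherwise. */
static unsigned read_granularity(unsigned bpp)
{
   return bpp <= 2 ? 64 / (8 * bpp) : bpp <= 8 ? 128 / (8 * bpp) : 256 / (8 * bpp);
}

int main(void)
{
   /* Prints: bpp 1 -> 8, bpp 2 -> 4, bpp 4 -> 4, bpp 8 -> 2, bpp 16 -> 2 */
   for (unsigned bpp = 1; bpp <= 16; bpp *= 2)
      printf("bpp %u -> %u elements\n", bpp, read_granularity(bpp));
   return 0;
}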
 
-static void cik_sdma_copy(struct pipe_context *ctx,
-                         struct pipe_resource *dst,
-                         unsigned dst_level,
-                         unsigned dstx, unsigned dsty, unsigned dstz,
-                         struct pipe_resource *src,
-                         unsigned src_level,
-                         const struct pipe_box *src_box)
+static void cik_sdma_copy(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dst_level,
+                          unsigned dstx, unsigned dsty, unsigned dstz, struct pipe_resource *src,
+                          unsigned src_level, const struct pipe_box *src_box)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-
-       assert(src->target != PIPE_BUFFER);
-
-       if (!sctx->sdma_cs ||
-           src->flags & PIPE_RESOURCE_FLAG_SPARSE ||
-           dst->flags & PIPE_RESOURCE_FLAG_SPARSE)
-               goto fallback;
-
-       /* SDMA causes corruption. See:
-        *   https://bugs.freedesktop.org/show_bug.cgi?id=110575
-        *   https://bugs.freedesktop.org/show_bug.cgi?id=110635
-        *
-        * Keep SDMA enabled on APUs.
-        */
-       if (sctx->screen->debug_flags & DBG(FORCE_SDMA) ||
-           (!sctx->screen->info.has_dedicated_vram &&
-            !(sctx->screen->debug_flags & DBG(NO_SDMA_COPY_IMAGE)))) {
-               if ((sctx->chip_class == GFX7 || sctx->chip_class == GFX8) &&
-                   cik_sdma_copy_texture(sctx, dst, dst_level, dstx, dsty, dstz,
-                                         src, src_level, src_box))
-                       return;
-               else if (sctx->chip_class == GFX9 &&
-                        si_sdma_v4_copy_texture(sctx, dst, dst_level, dstx, dsty, dstz,
-                                                src, src_level, src_box))
-                       return;
-       }
+   struct si_context *sctx = (struct si_context *)ctx;
+
+   assert(src->target != PIPE_BUFFER);
+
+   if (!sctx->sdma_cs || src->flags & PIPE_RESOURCE_FLAG_SPARSE ||
+       dst->flags & PIPE_RESOURCE_FLAG_SPARSE)
+      goto fallback;
+
+   /* SDMA causes corruption. See:
+    *   https://bugs.freedesktop.org/show_bug.cgi?id=110575
+    *   https://bugs.freedesktop.org/show_bug.cgi?id=110635
+    *
+    * Keep SDMA enabled on APUs.
+    */
+   if (sctx->screen->debug_flags & DBG(FORCE_SDMA) ||
+       (!sctx->screen->info.has_dedicated_vram &&
+        !(sctx->screen->debug_flags & DBG(NO_SDMA_COPY_IMAGE)))) {
+      if ((sctx->chip_class == GFX7 || sctx->chip_class == GFX8) &&
+          cik_sdma_copy_texture(sctx, dst, dst_level, dstx, dsty, dstz, src, src_level, src_box))
+         return;
+      else if (sctx->chip_class == GFX9 && si_sdma_v4_copy_texture(sctx, dst, dst_level, dstx, dsty,
+                                                                   dstz, src, src_level, src_box))
+         return;
+   }
 
 fallback:
-       si_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz,
-                               src, src_level, src_box);
+   si_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz, src, src_level, src_box);
 }
 
 void cik_init_sdma_functions(struct si_context *sctx)
 {
-       sctx->dma_copy = cik_sdma_copy;
+   sctx->dma_copy = cik_sdma_copy;
 }
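The width/height rounding in the tiled -> tiled path of cik_sdma_copy_texture above relies on plain power-of-two rounding: when the copy region already ends at the right/bottom edge of both surfaces, the padding past the edge is never visible, so it is safe to grow the copy to the 8-pixel micro-tile. A minimal sketch with a local helper standing in for the driver's align(), illustration only:

/* Round value up to a power-of-two alignment. */
static unsigned align_up(unsigned value, unsigned alignment)
{
   return (value + alignment - 1) & ~(alignment - 1);
}

/* Mirror of the copy_width_aligned logic above: only widen the copy when the
 * region touches the edge of both the source and the destination surface. */
static unsigned aligned_copy_width(unsigned copy_width, unsigned srcx, unsigned src_width,
                                   unsigned dstx, unsigned dst_width)
{
   if (copy_width % 8 != 0 && srcx + copy_width == src_width && dstx + copy_width == dst_width)
      return align_up(copy_width, 8);
   return copy_width;
}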
index 59b3d0a6b49b0657a05e8ccf9eacb437791e2151..1570f2860531af96d9a15c11df27c4936d9631b9 100644 (file)
@@ -1,18 +1,18 @@
 // DriConf options specific to radeonsi
 DRI_CONF_SECTION_PERFORMANCE
-    DRI_CONF_ADAPTIVE_SYNC("true")
-    DRI_CONF_RADEONSI_ASSUME_NO_Z_FIGHTS("false")
-    DRI_CONF_RADEONSI_COMMUTATIVE_BLEND_ADD("false")
-    DRI_CONF_RADEONSI_ZERO_ALL_VRAM_ALLOCS("false")
+DRI_CONF_ADAPTIVE_SYNC("true")
+DRI_CONF_RADEONSI_ASSUME_NO_Z_FIGHTS("false")
+DRI_CONF_RADEONSI_COMMUTATIVE_BLEND_ADD("false")
+DRI_CONF_RADEONSI_ZERO_ALL_VRAM_ALLOCS("false")
 DRI_CONF_SECTION_END
 
 DRI_CONF_SECTION_DEBUG
 
 //= BEGIN VERBATIM
-#define OPT_BOOL(name, dflt, description) \
-       DRI_CONF_OPT_BEGIN_B(radeonsi_##name, #dflt) \
-               DRI_CONF_DESC(en, description) \
-       DRI_CONF_OPT_END
+#define OPT_BOOL(name, dflt, description)                                                          \
+   DRI_CONF_OPT_BEGIN_B(radeonsi_##name, #dflt)                                                    \
+   DRI_CONF_DESC(en, description)                                                                  \
+   DRI_CONF_OPT_END
 
 #include "radeonsi/si_debug_options.h"
 //= END VERBATIM
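The OPT_BOOL definition above is one consumer of an X-macro list: si_debug_options.h contains one OPT_BOOL(...) entry per option, and each file that includes it defines OPT_BOOL to expand those entries into whatever it needs (DriConf option descriptions here). A reduced, self-contained illustration of the pattern; the option names below are made up and are not the real contents of si_debug_options.h:

#include <stdbool.h>

/* Stand-in for the option list normally provided by si_debug_options.h. */
#define EXAMPLE_OPTIONS \
   OPT_BOOL(example_flag, false, "An example boolean option") \
   OPT_BOOL(another_flag, true, "Another example boolean option")

/* Expansion 1: a struct holding the option values. */
#define OPT_BOOL(name, dflt, description) bool name;
struct example_options {
   EXAMPLE_OPTIONS
};
#undef OPT_BOOL

/* Expansion 2: a table of names, defaults and descriptions. */
#define OPT_BOOL(name, dflt, description) {#name, dflt, description},
static const struct {
   const char *name;
   bool dflt;
   const char *desc;
} example_defaults[] = {
   EXAMPLE_OPTIONS
};
#undef OPT_BOOL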
index c0a0bc8ce57fe70edfa7ccd9c8950f196b5dfdf2..aedf5090eed1b9ac7151c86d3fd543c51587155f 100644 (file)
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include <stddef.h>
-
 #include "si_pipe.h"
 #include "si_query.h"
+#include "sid.h"
 #include "util/u_memory.h"
 #include "util/u_suballoc.h"
-#include "sid.h"
+
+#include <stddef.h>
 
 /**
  * The query buffer is written to by ESGS NGG shaders with statistics about
  * without additional GPU cost.
  */
 struct gfx10_sh_query_buffer {
-       struct list_head list;
-       struct si_resource *buf;
-       unsigned refcount;
+   struct list_head list;
+   struct si_resource *buf;
+   unsigned refcount;
 
-       /* Offset into the buffer in bytes; points at the first un-emitted entry. */
-       unsigned head;
+   /* Offset into the buffer in bytes; points at the first un-emitted entry. */
+   unsigned head;
 };
 
 /* Memory layout of the query buffer. Must be kept in sync with shaders
@@ -55,469 +55,454 @@ struct gfx10_sh_query_buffer {
  * of all those values unconditionally.
  */
 struct gfx10_sh_query_buffer_mem {
-       struct {
-               uint64_t generated_primitives_start_dummy;
-               uint64_t emitted_primitives_start_dummy;
-               uint64_t generated_primitives;
-               uint64_t emitted_primitives;
-       } stream[4];
-       uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
-       uint32_t pad[31];
+   struct {
+      uint64_t generated_primitives_start_dummy;
+      uint64_t emitted_primitives_start_dummy;
+      uint64_t generated_primitives;
+      uint64_t emitted_primitives;
+   } stream[4];
+   uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
+   uint32_t pad[31];
 };
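This layout is what both the shader-side counters and the readback below rely on: four streams of four 64-bit counters (128 bytes), then a 32-bit fence padded so that each slot is exactly 256 bytes, i.e. the 32 qwords written per slot by the initialization loop in gfx10_alloc_query_buffer. A standalone mirror of the struct with those numbers checked at compile time, for illustration only:

#include <stddef.h>
#include <stdint.h>

/* Mirror of gfx10_sh_query_buffer_mem, used only to illustrate the layout. */
struct slot {
   struct {
      uint64_t generated_primitives_start_dummy;
      uint64_t emitted_primitives_start_dummy;
      uint64_t generated_primitives;
      uint64_t emitted_primitives;
   } stream[4];
   uint32_t fence;
   uint32_t pad[31];
};

_Static_assert(sizeof(struct slot) == 256, "one query slot is 32 qwords");
_Static_assert(offsetof(struct slot, fence) == 16 * sizeof(uint64_t),
               "the fence follows the 16 stream counters (qword index 16)");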
 
 /* Shader-based queries. */
 struct gfx10_sh_query {
-       struct si_query b;
+   struct si_query b;
 
-       struct gfx10_sh_query_buffer *first;
-       struct gfx10_sh_query_buffer *last;
-       unsigned first_begin;
-       unsigned last_end;
+   struct gfx10_sh_query_buffer *first;
+   struct gfx10_sh_query_buffer *last;
+   unsigned first_begin;
+   unsigned last_end;
 
-       unsigned stream;
+   unsigned stream;
 };
 
 static void emit_shader_query(struct si_context *sctx)
 {
-       assert(!list_is_empty(&sctx->shader_query_buffers));
+   assert(!list_is_empty(&sctx->shader_query_buffers));
 
-       struct gfx10_sh_query_buffer *qbuf = list_last_entry(&sctx->shader_query_buffers,
-                                                            struct gfx10_sh_query_buffer, list);
-       qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem);
+   struct gfx10_sh_query_buffer *qbuf =
+      list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
+   qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem);
 }
 
 static void gfx10_release_query_buffers(struct si_context *sctx,
-                                       struct gfx10_sh_query_buffer *first,
-                                       struct gfx10_sh_query_buffer *last)
+                                        struct gfx10_sh_query_buffer *first,
+                                        struct gfx10_sh_query_buffer *last)
 {
-       while (first) {
-               struct gfx10_sh_query_buffer *qbuf = first;
-               if (first != last)
-                       first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
-               else
-                       first = NULL;
-
-               qbuf->refcount--;
-               if (qbuf->refcount)
-                       continue;
-
-               if (qbuf->list.next == &sctx->shader_query_buffers)
-                       continue; /* keep the most recent buffer; it may not be full yet */
-               if (qbuf->list.prev == &sctx->shader_query_buffers)
-                       continue; /* keep the oldest buffer for recycling */
-
-               list_del(&qbuf->list);
-               si_resource_reference(&qbuf->buf, NULL);
-               FREE(qbuf);
-       }
+   while (first) {
+      struct gfx10_sh_query_buffer *qbuf = first;
+      if (first != last)
+         first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
+      else
+         first = NULL;
+
+      qbuf->refcount--;
+      if (qbuf->refcount)
+         continue;
+
+      if (qbuf->list.next == &sctx->shader_query_buffers)
+         continue; /* keep the most recent buffer; it may not be full yet */
+      if (qbuf->list.prev == &sctx->shader_query_buffers)
+         continue; /* keep the oldest buffer for recycling */
+
+      list_del(&qbuf->list);
+      si_resource_reference(&qbuf->buf, NULL);
+      FREE(qbuf);
+   }
 }
 
 static bool gfx10_alloc_query_buffer(struct si_context *sctx)
 {
-       if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
-               return true;
-
-       struct gfx10_sh_query_buffer *qbuf = NULL;
-
-       if (!list_is_empty(&sctx->shader_query_buffers)) {
-               qbuf = list_last_entry(&sctx->shader_query_buffers,
-                                      struct gfx10_sh_query_buffer, list);
-               if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
-                       goto success;
-
-               qbuf = list_first_entry(&sctx->shader_query_buffers,
-                                       struct gfx10_sh_query_buffer, list);
-               if (!qbuf->refcount &&
-                   !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
-                   sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
-                       /* Can immediately re-use the oldest buffer */
-                       list_del(&qbuf->list);
-               } else {
-                       qbuf = NULL;
-               }
-       }
-
-       if (!qbuf) {
-               qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
-               if (unlikely(!qbuf))
-                       return false;
-
-               struct si_screen *screen = sctx->screen;
-               unsigned buf_size = MAX2(sizeof(struct gfx10_sh_query_buffer_mem),
-                                        screen->info.min_alloc_size);
-               qbuf->buf = si_resource(
-                       pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
-               if (unlikely(!qbuf->buf)) {
-                       FREE(qbuf);
-                       return false;
-               }
-       }
-
-       /* The buffer is currently unused by the GPU. Initialize it.
-        *
-        * We need to set the high bit of all the primitive counters for
-        * compatibility with the SET_PREDICATION packet.
-        */
-       uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL,
-                                                PIPE_TRANSFER_WRITE |
-                                                PIPE_TRANSFER_UNSYNCHRONIZED);
-       assert(results);
-
-       for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem);
-            i < e; ++i) {
-               for (unsigned j = 0; j < 16; ++j)
-                       results[32 * i + j] = (uint64_t)1 << 63;
-               results[32 * i + 16] = 0;
-       }
-
-       list_addtail(&qbuf->list, &sctx->shader_query_buffers);
-       qbuf->head = 0;
-       qbuf->refcount = sctx->num_active_shader_queries;
+   if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
+      return true;
+
+   struct gfx10_sh_query_buffer *qbuf = NULL;
+
+   if (!list_is_empty(&sctx->shader_query_buffers)) {
+      qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
+      if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
+         goto success;
+
+      qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
+      if (!qbuf->refcount &&
+          !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
+          sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
+         /* Can immediately re-use the oldest buffer */
+         list_del(&qbuf->list);
+      } else {
+         qbuf = NULL;
+      }
+   }
+
+   if (!qbuf) {
+      qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
+      if (unlikely(!qbuf))
+         return false;
+
+      struct si_screen *screen = sctx->screen;
+      unsigned buf_size =
+         MAX2(sizeof(struct gfx10_sh_query_buffer_mem), screen->info.min_alloc_size);
+      qbuf->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
+      if (unlikely(!qbuf->buf)) {
+         FREE(qbuf);
+         return false;
+      }
+   }
+
+   /* The buffer is currently unused by the GPU. Initialize it.
+    *
+    * We need to set the high bit of all the primitive counters for
+    * compatibility with the SET_PREDICATION packet.
+    */
+   uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL,
+                                            PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED);
+   assert(results);
+
+   for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem); i < e;
+        ++i) {
+      for (unsigned j = 0; j < 16; ++j)
+         results[32 * i + j] = (uint64_t)1 << 63;
+      results[32 * i + 16] = 0;
+   }
+
+   list_addtail(&qbuf->list, &sctx->shader_query_buffers);
+   qbuf->head = 0;
+   qbuf->refcount = sctx->num_active_shader_queries;
 
 success:;
-       struct pipe_shader_buffer sbuf;
-       sbuf.buffer = &qbuf->buf->b.b;
-       sbuf.buffer_offset = qbuf->head;
-       sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
-       si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf);
-       sctx->current_vs_state |= S_VS_STATE_STREAMOUT_QUERY_ENABLED(1);
-
-       si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
-       return true;
+   struct pipe_shader_buffer sbuf;
+   sbuf.buffer = &qbuf->buf->b.b;
+   sbuf.buffer_offset = qbuf->head;
+   sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
+   si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf);
+   sctx->current_vs_state |= S_VS_STATE_STREAMOUT_QUERY_ENABLED(1);
+
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
+   return true;
 }
 
 static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
 {
-       struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
-       gfx10_release_query_buffers(sctx, query->first, query->last);
-       FREE(query);
+   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+   gfx10_release_query_buffers(sctx, query->first, query->last);
+   FREE(query);
 }
 
 static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
 {
-       struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
 
-       gfx10_release_query_buffers(sctx, query->first, query->last);
-       query->first = query->last = NULL;
+   gfx10_release_query_buffers(sctx, query->first, query->last);
+   query->first = query->last = NULL;
 
-       if (unlikely(!gfx10_alloc_query_buffer(sctx)))
-               return false;
+   if (unlikely(!gfx10_alloc_query_buffer(sctx)))
+      return false;
 
-       query->first = list_last_entry(&sctx->shader_query_buffers,
-                                      struct gfx10_sh_query_buffer, list);
-       query->first_begin = query->first->head;
+   query->first = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
+   query->first_begin = query->first->head;
 
-       sctx->num_active_shader_queries++;
-       query->first->refcount++;
+   sctx->num_active_shader_queries++;
+   query->first->refcount++;
 
-       return true;
+   return true;
 }
 
 static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery)
 {
-       struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
-
-       if (unlikely(!query->first))
-               return false; /* earlier out of memory error */
-
-       query->last = list_last_entry(&sctx->shader_query_buffers,
-                                     struct gfx10_sh_query_buffer, list);
-       query->last_end = query->last->head;
-
-       /* Signal the fence of the previous chunk */
-       if (query->last_end != 0) {
-               uint64_t fence_va = query->last->buf->gpu_address;
-               fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
-               fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
-               si_cp_release_mem(sctx, sctx->gfx_cs,
-                                 V_028A90_BOTTOM_OF_PIPE_TS, 0,
-                                 EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
-                                 EOP_DATA_SEL_VALUE_32BIT,
-                                 query->last->buf, fence_va, 0xffffffff,
-                                 PIPE_QUERY_GPU_FINISHED);
-       }
-
-       sctx->num_active_shader_queries--;
-
-       if (sctx->num_active_shader_queries > 0) {
-               gfx10_alloc_query_buffer(sctx);
-       } else {
-               si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL);
-               sctx->current_vs_state &= C_VS_STATE_STREAMOUT_QUERY_ENABLED;
-
-               /* If a query_begin is followed by a query_end without a draw
-                * in-between, we need to clear the atom to ensure that the
-                * next query_begin will re-initialize the shader buffer. */
-               si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
-       }
-
-       return true;
+   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+
+   if (unlikely(!query->first))
+      return false; /* earlier out of memory error */
+
+   query->last = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
+   query->last_end = query->last->head;
+
+   /* Signal the fence of the previous chunk */
+   if (query->last_end != 0) {
+      uint64_t fence_va = query->last->buf->gpu_address;
+      fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
+      fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
+      si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
+                        EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, query->last->buf, fence_va,
+                        0xffffffff, PIPE_QUERY_GPU_FINISHED);
+   }
+
+   sctx->num_active_shader_queries--;
+
+   if (sctx->num_active_shader_queries > 0) {
+      gfx10_alloc_query_buffer(sctx);
+   } else {
+      si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL);
+      sctx->current_vs_state &= C_VS_STATE_STREAMOUT_QUERY_ENABLED;
+
+      /* If a query_begin is followed by a query_end without a draw
+       * in-between, we need to clear the atom to ensure that the
+       * next query_begin will re-initialize the shader buffer. */
+      si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
+   }
+
+   return true;
 }
 
 static void gfx10_sh_query_add_result(struct gfx10_sh_query *query,
-                                     struct gfx10_sh_query_buffer_mem *qmem,
-                                     union pipe_query_result *result)
+                                      struct gfx10_sh_query_buffer_mem *qmem,
+                                      union pipe_query_result *result)
 {
-       static const uint64_t mask = ((uint64_t)1 << 63) - 1;
-
-       switch (query->b.type) {
-       case PIPE_QUERY_PRIMITIVES_EMITTED:
-               result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
-               break;
-       case PIPE_QUERY_PRIMITIVES_GENERATED:
-               result->u64 += qmem->stream[query->stream].generated_primitives & mask;
-               break;
-       case PIPE_QUERY_SO_STATISTICS:
-               result->so_statistics.num_primitives_written +=
-                       qmem->stream[query->stream].emitted_primitives & mask;
-               result->so_statistics.primitives_storage_needed +=
-                       qmem->stream[query->stream].generated_primitives & mask;
-               break;
-       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-               result->b |= qmem->stream[query->stream].emitted_primitives !=
-                            qmem->stream[query->stream].generated_primitives;
-               break;
-       case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
-               for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
-                       result->b |= qmem->stream[query->stream].emitted_primitives !=
-                                    qmem->stream[query->stream].generated_primitives;
-               }
-               break;
-       default:
-               assert(0);
-       }
+   static const uint64_t mask = ((uint64_t)1 << 63) - 1;
+
+   switch (query->b.type) {
+   case PIPE_QUERY_PRIMITIVES_EMITTED:
+      result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
+      break;
+   case PIPE_QUERY_PRIMITIVES_GENERATED:
+      result->u64 += qmem->stream[query->stream].generated_primitives & mask;
+      break;
+   case PIPE_QUERY_SO_STATISTICS:
+      result->so_statistics.num_primitives_written +=
+         qmem->stream[query->stream].emitted_primitives & mask;
+      result->so_statistics.primitives_storage_needed +=
+         qmem->stream[query->stream].generated_primitives & mask;
+      break;
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+      result->b |= qmem->stream[query->stream].emitted_primitives !=
+                   qmem->stream[query->stream].generated_primitives;
+      break;
+   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
+         result->b |= qmem->stream[query->stream].emitted_primitives !=
+                      qmem->stream[query->stream].generated_primitives;
+      }
+      break;
+   default:
+      assert(0);
+   }
 }
 
-static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery,
-                                     bool wait, union pipe_query_result *result)
+static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, bool wait,
+                                      union pipe_query_result *result)
 {
-       struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
 
-       util_query_clear_result(result, query->b.type);
+   util_query_clear_result(result, query->b.type);
 
-       if (unlikely(!query->first))
-               return false; /* earlier out of memory error */
-       assert(query->last);
+   if (unlikely(!query->first))
+      return false; /* earlier out of memory error */
+   assert(query->last);
 
-       for (struct gfx10_sh_query_buffer *qbuf = query->last;;
-            qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) {
-               unsigned usage = PIPE_TRANSFER_READ |
-                                (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
-               void *map;
+   for (struct gfx10_sh_query_buffer *qbuf = query->last;;
+        qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) {
+      unsigned usage = PIPE_TRANSFER_READ | (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
+      void *map;
 
-               if (rquery->b.flushed)
-                       map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
-               else
-                       map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
+      if (rquery->b.flushed)
+         map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
+      else
+         map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
 
-               if (!map)
-                       return false;
+      if (!map)
+         return false;
 
-               unsigned results_begin = 0;
-               unsigned results_end = qbuf->head;
-               if (qbuf == query->first)
-                       results_begin = query->first_begin;
-               if (qbuf == query->last)
-                       results_end = query->last_end;
+      unsigned results_begin = 0;
+      unsigned results_end = qbuf->head;
+      if (qbuf == query->first)
+         results_begin = query->first_begin;
+      if (qbuf == query->last)
+         results_end = query->last_end;
 
-               while (results_begin != results_end) {
-                       struct gfx10_sh_query_buffer_mem *qmem = map + results_begin;
-                       results_begin += sizeof(*qmem);
+      while (results_begin != results_end) {
+         struct gfx10_sh_query_buffer_mem *qmem = map + results_begin;
+         results_begin += sizeof(*qmem);
 
-                       gfx10_sh_query_add_result(query, qmem, result);
-               }
+         gfx10_sh_query_add_result(query, qmem, result);
+      }
 
-               if (qbuf == query->first)
-                       break;
-       }
+      if (qbuf == query->first)
+         break;
+   }
 
-       return true;
+   return true;
 }
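Because every counter is seeded with bit 63 set (for SET_PREDICATION compatibility, per the comment in gfx10_alloc_query_buffer), readback masks that bit off before accumulating. A reduced sketch of the inner accumulation over one mapped buffer range, assuming the 256-byte slot layout shown earlier; the real function above additionally walks the whole buffer list and handles every query type:

#include <stdint.h>

#define SLOT_SIZE 256 /* sizeof(struct gfx10_sh_query_buffer_mem) */

/* Sum one stream's emitted-primitive counter over the slots in [begin, end),
 * dropping the predication bit the slots were initialized with. */
static uint64_t sum_emitted_prims(const uint8_t *map, unsigned begin, unsigned end,
                                  unsigned stream)
{
   const uint64_t mask = ((uint64_t)1 << 63) - 1;
   uint64_t sum = 0;

   for (unsigned offset = begin; offset != end; offset += SLOT_SIZE) {
      const uint64_t *counters = (const uint64_t *)(map + offset);
      /* Per-stream layout: dummy, dummy, generated, emitted (4 qwords). */
      sum += counters[4 * stream + 3] & mask;
   }
   return sum;
}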
 
-static void gfx10_sh_query_get_result_resource(struct si_context *sctx,
-                                              struct si_query *rquery,
-                                              bool wait,
-                                              enum pipe_query_value_type result_type,
-                                              int index,
-                                              struct pipe_resource *resource,
-                                              unsigned offset)
+static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct si_query *rquery,
+                                               bool wait, enum pipe_query_value_type result_type,
+                                               int index, struct pipe_resource *resource,
+                                               unsigned offset)
 {
-       struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
-       struct si_qbo_state saved_state = {};
-       struct pipe_resource *tmp_buffer = NULL;
-       unsigned tmp_buffer_offset = 0;
-
-       if (!sctx->sh_query_result_shader) {
-               sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
-               if (!sctx->sh_query_result_shader)
-                       return;
-       }
-
-       if (query->first != query->last) {
-               u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16,
-                                    &tmp_buffer_offset, &tmp_buffer);
-               if (!tmp_buffer)
-                       return;
-       }
-
-       si_save_qbo_state(sctx, &saved_state);
-
-       /* Pre-fill the constants configuring the shader behavior. */
-       struct {
-               uint32_t config;
-               uint32_t offset;
-               uint32_t chain;
-               uint32_t result_count;
-       } consts;
-       struct pipe_constant_buffer constant_buffer = {};
-
-       if (index >= 0) {
-               switch (query->b.type) {
-               case PIPE_QUERY_PRIMITIVES_GENERATED:
-                       consts.offset = sizeof(uint32_t) * query->stream;
-                       consts.config = 0;
-                       break;
-               case PIPE_QUERY_PRIMITIVES_EMITTED:
-                       consts.offset = sizeof(uint32_t) * (4 + query->stream);
-                       consts.config = 0;
-                       break;
-               case PIPE_QUERY_SO_STATISTICS:
-                       consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
-                       consts.config = 0;
-                       break;
-               case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-                       consts.offset = sizeof(uint32_t) * query->stream;
-                       consts.config = 2;
-                       break;
-               case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
-                       consts.offset = 0;
-                       consts.config = 3;
-                       break;
-               default: unreachable("bad query type");
-               }
-       } else {
-               /* Check result availability. */
-               consts.offset = 0;
-               consts.config = 1;
-       }
-
-       if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
-               consts.config |= 8;
-
-       constant_buffer.buffer_size = sizeof(consts);
-       constant_buffer.user_buffer = &consts;
-
-       /* Pre-fill the SSBOs and grid. */
-       struct pipe_shader_buffer ssbo[3];
-       struct pipe_grid_info grid = {};
-
-       ssbo[1].buffer = tmp_buffer;
-       ssbo[1].buffer_offset = tmp_buffer_offset;
-       ssbo[1].buffer_size = 16;
-
-       ssbo[2] = ssbo[1];
-
-       sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader);
-
-       grid.block[0] = 1;
-       grid.block[1] = 1;
-       grid.block[2] = 1;
-       grid.grid[0] = 1;
-       grid.grid[1] = 1;
-       grid.grid[2] = 1;
-
-       struct gfx10_sh_query_buffer *qbuf = query->first;
-       for (;;) {
-               unsigned begin = qbuf == query->first ? query->first_begin : 0;
-               unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
-               if (!end)
-                       continue;
-
-               ssbo[0].buffer = &qbuf->buf->b.b;
-               ssbo[0].buffer_offset = begin;
-               ssbo[0].buffer_size = end - begin;
-
-               consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
-               consts.chain = 0;
-               if (qbuf != query->first)
-                       consts.chain |= 1;
-               if (qbuf != query->last)
-                       consts.chain |= 2;
-
-               if (qbuf == query->last) {
-                       ssbo[2].buffer = resource;
-                       ssbo[2].buffer_offset = offset;
-                       ssbo[2].buffer_size = 8;
-               }
-
-               sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
-               sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6);
-
-               if (wait) {
-                       uint64_t va;
-
-                       /* Wait for result availability. Wait only for readiness
-                        * of the last entry, since the fence writes should be
-                        * serialized in the CP.
-                        */
-                       va = qbuf->buf->gpu_address;
-                       va += end - sizeof(struct gfx10_sh_query_buffer_mem);
-                       va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
-
-                       si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
-               }
-
-               sctx->b.launch_grid(&sctx->b, &grid);
-               sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
-
-               if (qbuf == query->last)
-                       break;
-               qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
-       }
-
-       si_restore_qbo_state(sctx, &saved_state);
-       pipe_resource_reference(&tmp_buffer, NULL);
+   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+   struct si_qbo_state saved_state = {};
+   struct pipe_resource *tmp_buffer = NULL;
+   unsigned tmp_buffer_offset = 0;
+
+   if (!sctx->sh_query_result_shader) {
+      sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
+      if (!sctx->sh_query_result_shader)
+         return;
+   }
+
+   if (query->first != query->last) {
+      u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
+      if (!tmp_buffer)
+         return;
+   }
+
+   si_save_qbo_state(sctx, &saved_state);
+
+   /* Pre-fill the constants configuring the shader behavior. */
+   struct {
+      uint32_t config;
+      uint32_t offset;
+      uint32_t chain;
+      uint32_t result_count;
+   } consts;
+   struct pipe_constant_buffer constant_buffer = {};
+
+   if (index >= 0) {
+      switch (query->b.type) {
+      case PIPE_QUERY_PRIMITIVES_GENERATED:
+         consts.offset = sizeof(uint32_t) * query->stream;
+         consts.config = 0;
+         break;
+      case PIPE_QUERY_PRIMITIVES_EMITTED:
+         consts.offset = sizeof(uint32_t) * (4 + query->stream);
+         consts.config = 0;
+         break;
+      case PIPE_QUERY_SO_STATISTICS:
+         consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
+         consts.config = 0;
+         break;
+      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+         consts.offset = sizeof(uint32_t) * query->stream;
+         consts.config = 2;
+         break;
+      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+         consts.offset = 0;
+         consts.config = 3;
+         break;
+      default:
+         unreachable("bad query type");
+      }
+   } else {
+      /* Check result availability. */
+      consts.offset = 0;
+      consts.config = 1;
+   }
+
+   if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
+      consts.config |= 8;
+
+   constant_buffer.buffer_size = sizeof(consts);
+   constant_buffer.user_buffer = &consts;
+
+   /* Pre-fill the SSBOs and grid. */
+   struct pipe_shader_buffer ssbo[3];
+   struct pipe_grid_info grid = {};
+
+   ssbo[1].buffer = tmp_buffer;
+   ssbo[1].buffer_offset = tmp_buffer_offset;
+   ssbo[1].buffer_size = 16;
+
+   ssbo[2] = ssbo[1];
+
+   sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader);
+
+   grid.block[0] = 1;
+   grid.block[1] = 1;
+   grid.block[2] = 1;
+   grid.grid[0] = 1;
+   grid.grid[1] = 1;
+   grid.grid[2] = 1;
+
+   struct gfx10_sh_query_buffer *qbuf = query->first;
+   for (;;) {
+      unsigned begin = qbuf == query->first ? query->first_begin : 0;
+      unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
+      if (!end)
+         continue;
+
+      ssbo[0].buffer = &qbuf->buf->b.b;
+      ssbo[0].buffer_offset = begin;
+      ssbo[0].buffer_size = end - begin;
+
+      consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
+      consts.chain = 0;
+      if (qbuf != query->first)
+         consts.chain |= 1;
+      if (qbuf != query->last)
+         consts.chain |= 2;
+
+      if (qbuf == query->last) {
+         ssbo[2].buffer = resource;
+         ssbo[2].buffer_offset = offset;
+         ssbo[2].buffer_size = 8;
+      }
+
+      sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
+      sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6);
+
+      if (wait) {
+         uint64_t va;
+
+         /* Wait for result availability. Wait only for readiness
+          * of the last entry, since the fence writes should be
+          * serialized in the CP.
+          */
+         va = qbuf->buf->gpu_address;
+         va += end - sizeof(struct gfx10_sh_query_buffer_mem);
+         va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
+
+         si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
+      }
+
+      sctx->b.launch_grid(&sctx->b, &grid);
+      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+
+      if (qbuf == query->last)
+         break;
+      qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
+   }
+
+   si_restore_qbo_state(sctx, &saved_state);
+   pipe_resource_reference(&tmp_buffer, NULL);
 }
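For readability, the config/chain values written above can be summarized. The enumerator names below are invented for this note (the driver uses the literal values), and their interpretation lives in the result compute shader created by gfx10_create_sh_query_result_cs, which is not part of this hunk:

enum {
   QUERY_CS_ACCUMULATE      = 0, /* accumulate a primitive counter selected by `offset` */
   QUERY_CS_AVAILABILITY    = 1, /* index < 0: only report result availability */
   QUERY_CS_SO_OVERFLOW_ONE = 2, /* overflow predicate for a single stream */
   QUERY_CS_SO_OVERFLOW_ANY = 3, /* overflow predicate across all streams */
   QUERY_CS_RESULT_64BIT    = 8, /* OR'd into config for 64-bit result types */
};
/* chain: bit 0 set when this buffer is not the first of the query,
 *        bit 1 set when it is not the last. */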
 
 static const struct si_query_ops gfx10_sh_query_ops = {
-       .destroy = gfx10_sh_query_destroy,
-       .begin = gfx10_sh_query_begin,
-       .end = gfx10_sh_query_end,
-       .get_result = gfx10_sh_query_get_result,
-       .get_result_resource = gfx10_sh_query_get_result_resource,
+   .destroy = gfx10_sh_query_destroy,
+   .begin = gfx10_sh_query_begin,
+   .end = gfx10_sh_query_end,
+   .get_result = gfx10_sh_query_get_result,
+   .get_result_resource = gfx10_sh_query_get_result_resource,
 };
 
-struct pipe_query *gfx10_sh_query_create(struct si_screen *screen,
-                                        enum pipe_query_type query_type,
-                                        unsigned index)
+struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
+                                         unsigned index)
 {
-       struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query);
-       if (unlikely(!query))
-               return NULL;
+   struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query);
+   if (unlikely(!query))
+      return NULL;
 
-       query->b.ops = &gfx10_sh_query_ops;
-       query->b.type = query_type;
-       query->stream = index;
+   query->b.ops = &gfx10_sh_query_ops;
+   query->b.type = query_type;
+   query->stream = index;
 
-       return (struct pipe_query *)query;
+   return (struct pipe_query *)query;
 }
 
 void gfx10_init_query(struct si_context *sctx)
 {
-       list_inithead(&sctx->shader_query_buffers);
-       sctx->atoms.s.shader_query.emit = emit_shader_query;
+   list_inithead(&sctx->shader_query_buffers);
+   sctx->atoms.s.shader_query.emit = emit_shader_query;
 }
 
 void gfx10_destroy_query(struct si_context *sctx)
 {
-       while (!list_is_empty(&sctx->shader_query_buffers)) {
-               struct gfx10_sh_query_buffer *qbuf =
-                       list_first_entry(&sctx->shader_query_buffers,
-                                        struct gfx10_sh_query_buffer, list);
-               list_del(&qbuf->list);
-
-               assert(!qbuf->refcount);
-               si_resource_reference(&qbuf->buf, NULL);
-               FREE(qbuf);
-       }
+   while (!list_is_empty(&sctx->shader_query_buffers)) {
+      struct gfx10_sh_query_buffer *qbuf =
+         list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
+      list_del(&qbuf->list);
+
+      assert(!qbuf->refcount);
+      si_resource_reference(&qbuf->buf, NULL);
+      FREE(qbuf);
+   }
 }
index 63439733507e2dfe3349b4089c6a180f2ceb3476..06eba4a1f61f2bcab2613771833aa6e373b5c427 100644 (file)
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
+#include "ac_llvm_cull.h"
 #include "si_pipe.h"
 #include "si_shader_internal.h"
-
 #include "sid.h"
-
 #include "util/u_memory.h"
 #include "util/u_prim.h"
-#include "ac_llvm_cull.h"
 
 static LLVMValueRef get_wave_id_in_tg(struct si_shader_context *ctx)
 {
-       return si_unpack_param(ctx, ctx->merged_wave_info, 24, 4);
+   return si_unpack_param(ctx, ctx->merged_wave_info, 24, 4);
 }
 
 static LLVMValueRef get_tgsize(struct si_shader_context *ctx)
 {
-       return si_unpack_param(ctx, ctx->merged_wave_info, 28, 4);
+   return si_unpack_param(ctx, ctx->merged_wave_info, 28, 4);
 }
 
 static LLVMValueRef get_thread_id_in_tg(struct si_shader_context *ctx)
 {
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef tmp;
-       tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
-                          LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), "");
-       return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMValueRef tmp;
+   tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
+                      LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), "");
+   return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
 }
 
 static LLVMValueRef ngg_get_vtx_cnt(struct si_shader_context *ctx)
 {
-       return si_unpack_param(ctx, ctx->gs_tg_info, 12, 9);
+   return si_unpack_param(ctx, ctx->gs_tg_info, 12, 9);
 }
 
 static LLVMValueRef ngg_get_prim_cnt(struct si_shader_context *ctx)
 {
-       return si_unpack_param(ctx, ctx->gs_tg_info, 22, 9);
+   return si_unpack_param(ctx, ctx->gs_tg_info, 22, 9);
 }
 
 static LLVMValueRef ngg_get_ordered_id(struct si_shader_context *ctx)
 {
-       return si_unpack_param(ctx, ctx->gs_tg_info, 0, 12);
+   return si_unpack_param(ctx, ctx->gs_tg_info, 0, 12);
 }
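The helpers above all pull bitfields out of packed SGPR arguments with si_unpack_param(ctx, arg, offset, width), which corresponds to a scalar shift-and-mask. A plain-C sketch of the same extraction for the fields used here; the helper and field names below are descriptive only:

#include <stdint.h>

/* Scalar equivalent of si_unpack_param(ctx, value, offset, width). */
static uint32_t unpack_field(uint32_t value, unsigned offset, unsigned width)
{
   return (value >> offset) & ((1u << width) - 1);
}

static uint32_t wave_id_in_tg(uint32_t merged_wave_info)
{
   return unpack_field(merged_wave_info, 24, 4); /* bits [24..27], as above */
}

/* Other fields read above: tg size = (merged_wave_info, 28, 4),
 * ordered id = (gs_tg_info, 0, 12), vertex count = (gs_tg_info, 12, 9),
 * primitive count = (gs_tg_info, 22, 9). */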
 
 static LLVMValueRef ngg_get_query_buf(struct si_shader_context *ctx)
 {
-       LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
 
-       return ac_build_load_to_sgpr(&ctx->ac, buf_ptr,
-                                    LLVMConstInt(ctx->ac.i32, GFX10_GS_QUERY_BUF, false));
+   return ac_build_load_to_sgpr(&ctx->ac, buf_ptr,
+                                LLVMConstInt(ctx->ac.i32, GFX10_GS_QUERY_BUF, false));
 }
 
 static LLVMValueRef ngg_get_initial_edgeflag(struct si_shader_context *ctx, unsigned index)
 {
-       if (ctx->type == PIPE_SHADER_VERTEX) {
-               LLVMValueRef tmp;
-               tmp = LLVMBuildLShr(ctx->ac.builder,
-                                   ac_get_arg(&ctx->ac, ctx->args.gs_invocation_id),
-                                   LLVMConstInt(ctx->ac.i32, 8 + index, false), "");
-               return LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i1, "");
-       }
-       return ctx->ac.i1false;
+   if (ctx->type == PIPE_SHADER_VERTEX) {
+      LLVMValueRef tmp;
+      tmp = LLVMBuildLShr(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args.gs_invocation_id),
+                          LLVMConstInt(ctx->ac.i32, 8 + index, false), "");
+      return LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i1, "");
+   }
+   return ctx->ac.i1false;
 }
 
 /**
  * Return the number of vertices as a constant in \p num_vertices,
  * and return a more precise value as LLVMValueRef from the function.
  */
-static LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx,
-                                             unsigned *num_vertices)
+static LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx, unsigned *num_vertices)
 {
-       const struct si_shader_info *info = &ctx->shader->selector->info;
-
-       if (ctx->type == PIPE_SHADER_VERTEX) {
-               if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
-                       /* Blits always use axis-aligned rectangles with 3 vertices. */
-                       *num_vertices = 3;
-                       return LLVMConstInt(ctx->ac.i32, 3, 0);
-               } else {
-                       /* We always build up all three indices for the prim export
-                        * independent of the primitive type. The additional garbage
-                        * data shouldn't hurt. This number doesn't matter with
-                        * NGG passthrough.
-                        */
-                       *num_vertices = 3;
-
-                       /* Extract OUTPRIM field. */
-                       LLVMValueRef num = si_unpack_param(ctx, ctx->vs_state_bits, 2, 2);
-                       return LLVMBuildAdd(ctx->ac.builder, num, ctx->ac.i32_1, "");
-               }
-       } else {
-               assert(ctx->type == PIPE_SHADER_TESS_EVAL);
-
-               if (info->properties[TGSI_PROPERTY_TES_POINT_MODE])
-                       *num_vertices = 1;
-               else if (info->properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES)
-                       *num_vertices = 2;
-               else
-                       *num_vertices = 3;
-
-               return LLVMConstInt(ctx->ac.i32, *num_vertices, false);
-       }
+   const struct si_shader_info *info = &ctx->shader->selector->info;
+
+   if (ctx->type == PIPE_SHADER_VERTEX) {
+      if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
+         /* Blits always use axis-aligned rectangles with 3 vertices. */
+         *num_vertices = 3;
+         return LLVMConstInt(ctx->ac.i32, 3, 0);
+      } else {
+         /* We always build up all three indices for the prim export
+          * independent of the primitive type. The additional garbage
+          * data shouldn't hurt. This number doesn't matter with
+          * NGG passthrough.
+          */
+         *num_vertices = 3;
+
+         /* Extract OUTPRIM field. */
+         LLVMValueRef num = si_unpack_param(ctx, ctx->vs_state_bits, 2, 2);
+         return LLVMBuildAdd(ctx->ac.builder, num, ctx->ac.i32_1, "");
+      }
+   } else {
+      assert(ctx->type == PIPE_SHADER_TESS_EVAL);
+
+      if (info->properties[TGSI_PROPERTY_TES_POINT_MODE])
+         *num_vertices = 1;
+      else if (info->properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES)
+         *num_vertices = 2;
+      else
+         *num_vertices = 3;
+
+      return LLVMConstInt(ctx->ac.i32, *num_vertices, false);
+   }
 }
 
 bool gfx10_ngg_export_prim_early(struct si_shader *shader)
 {
-       struct si_shader_selector *sel = shader->selector;
+   struct si_shader_selector *sel = shader->selector;
 
-       assert(shader->key.as_ngg && !shader->key.as_es);
+   assert(shader->key.as_ngg && !shader->key.as_es);
 
-       return sel->type != PIPE_SHADER_GEOMETRY &&
-              !sel->info.writes_edgeflag;
+   return sel->type != PIPE_SHADER_GEOMETRY && !sel->info.writes_edgeflag;
 }
 
 void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx)
 {
-       ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx),
-                                     ngg_get_vtx_cnt(ctx),
-                                     ngg_get_prim_cnt(ctx));
+   ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ngg_get_vtx_cnt(ctx),
+                                 ngg_get_prim_cnt(ctx));
 }
 
-void gfx10_ngg_build_export_prim(struct si_shader_context *ctx,
-                                LLVMValueRef user_edgeflags[3],
-                                LLVMValueRef prim_passthrough)
+void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3],
+                                 LLVMValueRef prim_passthrough)
 {
-       LLVMBuilderRef builder = ctx->ac.builder;
-
-       if (gfx10_is_ngg_passthrough(ctx->shader) ||
-           ctx->shader->key.opt.ngg_culling) {
-               ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
-               {
-                       struct ac_ngg_prim prim = {};
-
-                       if (prim_passthrough)
-                               prim.passthrough = prim_passthrough;
-                       else
-                               prim.passthrough = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);
-
-                       /* This is only used with NGG culling, which returns the NGG
-                        * passthrough prim export encoding.
-                        */
-                       if (ctx->shader->selector->info.writes_edgeflag) {
-                               unsigned all_bits_no_edgeflags = ~SI_NGG_PRIM_EDGE_FLAG_BITS;
-                               LLVMValueRef edgeflags = LLVMConstInt(ctx->ac.i32, all_bits_no_edgeflags, 0);
-
-                               unsigned num_vertices;
-                               ngg_get_vertices_per_prim(ctx, &num_vertices);
-
-                               for (unsigned i = 0; i < num_vertices; i++) {
-                                       unsigned shift = 9 + i*10;
-                                       LLVMValueRef edge;
-
-                                       edge = LLVMBuildLoad(builder, user_edgeflags[i], "");
-                                       edge = LLVMBuildZExt(builder, edge, ctx->ac.i32, "");
-                                       edge = LLVMBuildShl(builder, edge, LLVMConstInt(ctx->ac.i32, shift, 0), "");
-                                       edgeflags = LLVMBuildOr(builder, edgeflags, edge, "");
-                               }
-                               prim.passthrough = LLVMBuildAnd(builder, prim.passthrough, edgeflags, "");
-                       }
-
-                       ac_build_export_prim(&ctx->ac, &prim);
-               }
-               ac_build_endif(&ctx->ac, 6001);
-               return;
-       }
-
-       ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
-       {
-               struct ac_ngg_prim prim = {};
-
-               ngg_get_vertices_per_prim(ctx, &prim.num_vertices);
-
-               prim.isnull = ctx->ac.i1false;
-               prim.index[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
-               prim.index[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
-               prim.index[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
-
-               for (unsigned i = 0; i < prim.num_vertices; ++i) {
-                       prim.edgeflag[i] = ngg_get_initial_edgeflag(ctx, i);
-
-                       if (ctx->shader->selector->info.writes_edgeflag) {
-                               LLVMValueRef edge;
-
-                               edge = LLVMBuildLoad(ctx->ac.builder, user_edgeflags[i], "");
-                               edge = LLVMBuildAnd(ctx->ac.builder, prim.edgeflag[i], edge, "");
-                               prim.edgeflag[i] = edge;
-                       }
-               }
-
-               ac_build_export_prim(&ctx->ac, &prim);
-       }
-       ac_build_endif(&ctx->ac, 6001);
+   LLVMBuilderRef builder = ctx->ac.builder;
+
+   if (gfx10_is_ngg_passthrough(ctx->shader) || ctx->shader->key.opt.ngg_culling) {
+      ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
+      {
+         struct ac_ngg_prim prim = {};
+
+         if (prim_passthrough)
+            prim.passthrough = prim_passthrough;
+         else
+            prim.passthrough = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);
+
+         /* This is only used with NGG culling, which returns the NGG
+          * passthrough prim export encoding.
+          */
+         if (ctx->shader->selector->info.writes_edgeflag) {
+            unsigned all_bits_no_edgeflags = ~SI_NGG_PRIM_EDGE_FLAG_BITS;
+            LLVMValueRef edgeflags = LLVMConstInt(ctx->ac.i32, all_bits_no_edgeflags, 0);
+
+            unsigned num_vertices;
+            ngg_get_vertices_per_prim(ctx, &num_vertices);
+
+            for (unsigned i = 0; i < num_vertices; i++) {
+               unsigned shift = 9 + i * 10;
+               LLVMValueRef edge;
+
+               edge = LLVMBuildLoad(builder, user_edgeflags[i], "");
+               edge = LLVMBuildZExt(builder, edge, ctx->ac.i32, "");
+               edge = LLVMBuildShl(builder, edge, LLVMConstInt(ctx->ac.i32, shift, 0), "");
+               edgeflags = LLVMBuildOr(builder, edgeflags, edge, "");
+            }
+            prim.passthrough = LLVMBuildAnd(builder, prim.passthrough, edgeflags, "");
+         }
+
+         ac_build_export_prim(&ctx->ac, &prim);
+      }
+      ac_build_endif(&ctx->ac, 6001);
+      return;
+   }
+
+   ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
+   {
+      struct ac_ngg_prim prim = {};
+
+      ngg_get_vertices_per_prim(ctx, &prim.num_vertices);
+
+      prim.isnull = ctx->ac.i1false;
+      prim.index[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
+      prim.index[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
+      prim.index[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
+
+      for (unsigned i = 0; i < prim.num_vertices; ++i) {
+         prim.edgeflag[i] = ngg_get_initial_edgeflag(ctx, i);
+
+         if (ctx->shader->selector->info.writes_edgeflag) {
+            LLVMValueRef edge;
+
+            edge = LLVMBuildLoad(ctx->ac.builder, user_edgeflags[i], "");
+            edge = LLVMBuildAnd(ctx->ac.builder, prim.edgeflag[i], edge, "");
+            prim.edgeflag[i] = edge;
+         }
+      }
+
+      ac_build_export_prim(&ctx->ac, &prim);
+   }
+   ac_build_endif(&ctx->ac, 6001);
 }
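In the passthrough/culling branch above, the primitive export word already carries the connectivity, so only the per-vertex edge-flag bits (at 9 + 10*i, per the shifts used above) get patched: the hardware-provided flags are ANDed with the flags the shader wrote. A host-side sketch of that patch-up; the bit positions of SI_NGG_PRIM_EDGE_FLAG_BITS (9/19/29) are an assumption inferred from those shifts, not taken from hardware documentation:

#include <stdbool.h>
#include <stdint.h>

/* AND the edge-flag bits of an NGG passthrough primitive word with the
 * shader-written edge flags, leaving all other bits untouched. */
static uint32_t apply_user_edge_flags(uint32_t prim, const bool edge[3], unsigned num_vertices)
{
   const uint32_t edge_flag_bits = (1u << 9) | (1u << 19) | (1u << 29); /* assumed */
   uint32_t keep = ~edge_flag_bits;

   for (unsigned i = 0; i < num_vertices; i++)
      keep |= (uint32_t)edge[i] << (9 + 10 * i);

   return prim & keep;
}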
 
-static void build_streamout_vertex(struct si_shader_context *ctx,
-                                  LLVMValueRef *so_buffer, LLVMValueRef *wg_offset_dw,
-                                  unsigned stream, LLVMValueRef offset_vtx,
-                                  LLVMValueRef vertexptr)
+static void build_streamout_vertex(struct si_shader_context *ctx, LLVMValueRef *so_buffer,
+                                   LLVMValueRef *wg_offset_dw, unsigned stream,
+                                   LLVMValueRef offset_vtx, LLVMValueRef vertexptr)
 {
-       struct si_shader_info *info = &ctx->shader->selector->info;
-       struct pipe_stream_output_info *so = &ctx->shader->selector->so;
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef offset[4] = {};
-       LLVMValueRef tmp;
-
-       for (unsigned buffer = 0; buffer < 4; ++buffer) {
-               if (!wg_offset_dw[buffer])
-                       continue;
-
-               tmp = LLVMBuildMul(builder, offset_vtx,
-                                  LLVMConstInt(ctx->ac.i32, so->stride[buffer], false), "");
-               tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, "");
-               offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), "");
-       }
-
-       for (unsigned i = 0; i < so->num_outputs; ++i) {
-               if (so->output[i].stream != stream)
-                       continue;
-
-               unsigned reg = so->output[i].register_index;
-               struct si_shader_output_values out;
-               out.semantic_name = info->output_semantic_name[reg];
-               out.semantic_index = info->output_semantic_index[reg];
-
-               for (unsigned comp = 0; comp < 4; comp++) {
-                       tmp = ac_build_gep0(&ctx->ac, vertexptr,
-                                           LLVMConstInt(ctx->ac.i32, 4 * reg + comp, false));
-                       out.values[comp] = LLVMBuildLoad(builder, tmp, "");
-                       out.vertex_stream[comp] =
-                               (info->output_streams[reg] >> (2 * comp)) & 3;
-               }
-
-               si_llvm_streamout_store_output(ctx, so_buffer, offset, &so->output[i], &out);
-       }
+   struct si_shader_info *info = &ctx->shader->selector->info;
+   struct pipe_stream_output_info *so = &ctx->shader->selector->so;
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMValueRef offset[4] = {};
+   LLVMValueRef tmp;
+
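+   /* Compute the byte offset of this vertex in each enabled streamout buffer:
+    * (workgroup base offset in dwords + vertex index * stride in dwords) * 4.
+    */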
+   for (unsigned buffer = 0; buffer < 4; ++buffer) {
+      if (!wg_offset_dw[buffer])
+         continue;
+
+      tmp = LLVMBuildMul(builder, offset_vtx, LLVMConstInt(ctx->ac.i32, so->stride[buffer], false),
+                         "");
+      tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, "");
+      offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), "");
+   }
+
+   for (unsigned i = 0; i < so->num_outputs; ++i) {
+      if (so->output[i].stream != stream)
+         continue;
+
+      unsigned reg = so->output[i].register_index;
+      struct si_shader_output_values out;
+      out.semantic_name = info->output_semantic_name[reg];
+      out.semantic_index = info->output_semantic_index[reg];
+
+      for (unsigned comp = 0; comp < 4; comp++) {
+         tmp = ac_build_gep0(&ctx->ac, vertexptr, LLVMConstInt(ctx->ac.i32, 4 * reg + comp, false));
+         out.values[comp] = LLVMBuildLoad(builder, tmp, "");
+         out.vertex_stream[comp] = (info->output_streams[reg] >> (2 * comp)) & 3;
+      }
+
+      si_llvm_streamout_store_output(ctx, so_buffer, offset, &so->output[i], &out);
+   }
 }
 
 struct ngg_streamout {
-       LLVMValueRef num_vertices;
+   LLVMValueRef num_vertices;
 
-       /* per-thread data */
-       LLVMValueRef prim_enable[4]; /* i1 per stream */
-       LLVMValueRef vertices[3]; /* [N x i32] addrspace(LDS)* */
+   /* per-thread data */
+   LLVMValueRef prim_enable[4]; /* i1 per stream */
+   LLVMValueRef vertices[3];    /* [N x i32] addrspace(LDS)* */
 
-       /* Output */
-       LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */
+   /* Output */
+   LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */
 };
 
 /**
@@ -276,427 +265,405 @@ struct ngg_streamout {
  *
  * Clobbers gs_ngg_scratch[8:].
  */
-static void build_streamout(struct si_shader_context *ctx,
-                           struct ngg_streamout *nggso)
+static void build_streamout(struct si_shader_context *ctx, struct ngg_streamout *nggso)
 {
-       struct si_shader_info *info = &ctx->shader->selector->info;
-       struct pipe_stream_output_info *so = &ctx->shader->selector->so;
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
-       LLVMValueRef tid = get_thread_id_in_tg(ctx);
-       LLVMValueRef tmp, tmp2;
-       LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false);
-       LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false);
-       LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false);
-       LLVMValueRef so_buffer[4] = {};
-       unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) +
-                                       (nggso->vertices[2] ? 1 : 0);
-       LLVMValueRef prim_stride_dw[4] = {};
-       LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32);
-       int stream_for_buffer[4] = { -1, -1, -1, -1 };
-       unsigned bufmask_for_stream[4] = {};
-       bool isgs = ctx->type == PIPE_SHADER_GEOMETRY;
-       unsigned scratch_emit_base = isgs ? 4 : 0;
-       LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->ac.i32_0;
-       unsigned scratch_offset_base = isgs ? 8 : 4;
-       LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4;
-
-       ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256);
-
-       /* Determine the mapping of streamout buffers to vertex streams. */
-       for (unsigned i = 0; i < so->num_outputs; ++i) {
-               unsigned buf = so->output[i].output_buffer;
-               unsigned stream = so->output[i].stream;
-               assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream);
-               stream_for_buffer[buf] = stream;
-               bufmask_for_stream[stream] |= 1 << buf;
-       }
-
-       for (unsigned buffer = 0; buffer < 4; ++buffer) {
-               if (stream_for_buffer[buffer] == -1)
-                       continue;
-
-               assert(so->stride[buffer]);
-
-               tmp = LLVMConstInt(ctx->ac.i32, so->stride[buffer], false);
-               prim_stride_dw[buffer] = LLVMBuildMul(builder, tmp, nggso->num_vertices, "");
-               prim_stride_dw_vgpr = ac_build_writelane(
-                       &ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer],
-                       LLVMConstInt(ctx->ac.i32, buffer, false));
-
-               so_buffer[buffer] = ac_build_load_to_sgpr(
-                       &ctx->ac, buf_ptr,
-                       LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + buffer, false));
-       }
-
-       tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
-       ac_build_ifcc(&ctx->ac, tmp, 5200);
-       {
-               LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
-               LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");
-
-               /* Advance the streamout offsets in GDS. */
-               LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
-               LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
-
-               tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
-               ac_build_ifcc(&ctx->ac, tmp, 5210);
-               {
-                       if (isgs) {
-                               tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid);
-                               tmp = LLVMBuildLoad(builder, tmp, "");
-                       } else {
-                               tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0,
-                                               ngg_get_prim_cnt(ctx), ctx->ac.i32_0);
-                       }
-                       LLVMBuildStore(builder, tmp, generated_by_stream_vgpr);
-
-                       unsigned swizzle[4];
-                       int unused_stream = -1;
-                       for (unsigned stream = 0; stream < 4; ++stream) {
-                               if (!info->num_stream_output_components[stream]) {
-                                       unused_stream = stream;
-                                       break;
-                               }
-                       }
-                       for (unsigned buffer = 0; buffer < 4; ++buffer) {
-                               if (stream_for_buffer[buffer] >= 0) {
-                                       swizzle[buffer] = stream_for_buffer[buffer];
-                               } else {
-                                       assert(unused_stream >= 0);
-                                       swizzle[buffer] = unused_stream;
-                               }
-                       }
-
-                       tmp = ac_build_quad_swizzle(&ctx->ac, tmp,
-                               swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
-                       tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
-
-                       LLVMValueRef args[] = {
-                               LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""),
-                               tmp,
-                               ctx->ac.i32_0, // ordering
-                               ctx->ac.i32_0, // scope
-                               ctx->ac.i1false, // isVolatile
-                               LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index
-                               ctx->ac.i1true, // wave release
-                               ctx->ac.i1true, // wave done
-                       };
-                       tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add",
-                                                ctx->ac.i32, args, ARRAY_SIZE(args), 0);
-
-                       /* Keep offsets in a VGPR for quick retrieval via readlane by
-                        * the first wave for bounds checking, and also store in LDS
-                        * for retrieval by all waves later. */
-                       LLVMBuildStore(builder, tmp, offsets_vgpr);
-
-                       tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac),
-                                           scratch_offset_basev, "");
-                       tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2);
-                       LLVMBuildStore(builder, tmp, tmp2);
-               }
-               ac_build_endif(&ctx->ac, 5210);
-
-               /* Determine the max emit per buffer. This is done via the SALU, in part
-                * because LLVM can't generate divide-by-multiply if we try to do this
-                * via VALU with one lane per buffer.
-                */
-               LLVMValueRef max_emit[4] = {};
-               for (unsigned buffer = 0; buffer < 4; ++buffer) {
-                       if (stream_for_buffer[buffer] == -1)
-                               continue;
-
-                       LLVMValueRef bufsize_dw =
-                               LLVMBuildLShr(builder,
-                                       LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""),
-                                       i32_2, "");
-
-                       tmp = LLVMBuildLoad(builder, offsets_vgpr, "");
-                       LLVMValueRef offset_dw =
-                               ac_build_readlane(&ctx->ac, tmp,
-                                               LLVMConstInt(ctx->ac.i32, buffer, false));
-
-                       tmp = LLVMBuildSub(builder, bufsize_dw, offset_dw, "");
-                       tmp = LLVMBuildUDiv(builder, tmp, prim_stride_dw[buffer], "");
-
-                       tmp2 = LLVMBuildICmp(builder, LLVMIntULT, bufsize_dw, offset_dw, "");
-                       max_emit[buffer] = LLVMBuildSelect(builder, tmp2, ctx->ac.i32_0, tmp, "");
-               }
-
-               /* Determine the number of emitted primitives per stream and fixup the
-                * GDS counter if necessary.
-                *
-                * This is complicated by the fact that a single stream can emit to
-                * multiple buffers (but luckily not vice versa).
-                */
-               LLVMValueRef emit_vgpr = ctx->ac.i32_0;
-
-               for (unsigned stream = 0; stream < 4; ++stream) {
-                       if (!info->num_stream_output_components[stream])
-                               continue;
-
-                       tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, "");
-                       LLVMValueRef generated =
-                               ac_build_readlane(&ctx->ac, tmp,
-                                                 LLVMConstInt(ctx->ac.i32, stream, false));
-
-                       LLVMValueRef emit = generated;
-                       for (unsigned buffer = 0; buffer < 4; ++buffer) {
-                               if (stream_for_buffer[buffer] == stream)
-                                       emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]);
-                       }
-
-                       emit_vgpr = ac_build_writelane(&ctx->ac, emit_vgpr, emit,
-                                                      LLVMConstInt(ctx->ac.i32, stream, false));
-
-                       /* Fixup the offset using a plain GDS atomic if we overflowed. */
-                       tmp = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, "");
-                       ac_build_ifcc(&ctx->ac, tmp, 5221); /* scalar branch */
-                       tmp = LLVMBuildLShr(builder,
-                                           LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false),
-                                           ac_get_thread_id(&ctx->ac), "");
-                       tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
-                       ac_build_ifcc(&ctx->ac, tmp, 5222);
-                       {
-                               tmp = LLVMBuildSub(builder, generated, emit, "");
-                               tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
-                               tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, "");
-                               LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp,
-                                                  LLVMAtomicOrderingMonotonic, false);
-                       }
-                       ac_build_endif(&ctx->ac, 5222);
-                       ac_build_endif(&ctx->ac, 5221);
-               }
-
-               tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
-               ac_build_ifcc(&ctx->ac, tmp, 5225);
-               {
-                       tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac),
-                                          scratch_emit_basev, "");
-                       tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp);
-                       LLVMBuildStore(builder, emit_vgpr, tmp);
-               }
-               ac_build_endif(&ctx->ac, 5225);
-       }
-       ac_build_endif(&ctx->ac, 5200);
-
-       /* Determine the workgroup-relative per-thread / primitive offset into
-        * the streamout buffers */
-       struct ac_wg_scan primemit_scan[4] = {};
-
-       if (isgs) {
-               for (unsigned stream = 0; stream < 4; ++stream) {
-                       if (!info->num_stream_output_components[stream])
-                               continue;
-
-                       primemit_scan[stream].enable_exclusive = true;
-                       primemit_scan[stream].op = nir_op_iadd;
-                       primemit_scan[stream].src = nggso->prim_enable[stream];
-                       primemit_scan[stream].scratch =
-                               ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch,
-                                       LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false));
-                       primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx);
-                       primemit_scan[stream].numwaves = get_tgsize(ctx);
-                       primemit_scan[stream].maxwaves = 8;
-                       ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]);
-               }
-       }
-
-       ac_build_s_barrier(&ctx->ac);
-
-       /* Fetch the per-buffer offsets and per-stream emit counts in all waves. */
-       LLVMValueRef wgoffset_dw[4] = {};
-
-       {
-               LLVMValueRef scratch_vgpr;
-
-               tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac));
-               scratch_vgpr = LLVMBuildLoad(builder, tmp, "");
-
-               for (unsigned buffer = 0; buffer < 4; ++buffer) {
-                       if (stream_for_buffer[buffer] >= 0) {
-                               wgoffset_dw[buffer] = ac_build_readlane(
-                                       &ctx->ac, scratch_vgpr,
-                                       LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false));
-                       }
-               }
-
-               for (unsigned stream = 0; stream < 4; ++stream) {
-                       if (info->num_stream_output_components[stream]) {
-                               nggso->emit[stream] = ac_build_readlane(
-                                       &ctx->ac, scratch_vgpr,
-                                       LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false));
-                       }
-               }
-       }
-
-       /* Write out primitive data */
-       for (unsigned stream = 0; stream < 4; ++stream) {
-               if (!info->num_stream_output_components[stream])
-                       continue;
-
-               if (isgs) {
-                       ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]);
-               } else {
-                       primemit_scan[stream].result_exclusive = tid;
-               }
-
-               tmp = LLVMBuildICmp(builder, LLVMIntULT,
-                                   primemit_scan[stream].result_exclusive,
-                                   nggso->emit[stream], "");
-               tmp = LLVMBuildAnd(builder, tmp, nggso->prim_enable[stream], "");
-               ac_build_ifcc(&ctx->ac, tmp, 5240);
-               {
-                       LLVMValueRef offset_vtx =
-                               LLVMBuildMul(builder, primemit_scan[stream].result_exclusive,
-                                            nggso->num_vertices, "");
-
-                       for (unsigned i = 0; i < max_num_vertices; ++i) {
-                               tmp = LLVMBuildICmp(builder, LLVMIntULT,
-                                                   LLVMConstInt(ctx->ac.i32, i, false),
-                                                   nggso->num_vertices, "");
-                               ac_build_ifcc(&ctx->ac, tmp, 5241);
-                               build_streamout_vertex(ctx, so_buffer, wgoffset_dw,
-                                                      stream, offset_vtx, nggso->vertices[i]);
-                               ac_build_endif(&ctx->ac, 5241);
-                               offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, "");
-                       }
-               }
-               ac_build_endif(&ctx->ac, 5240);
-       }
+   struct si_shader_info *info = &ctx->shader->selector->info;
+   struct pipe_stream_output_info *so = &ctx->shader->selector->so;
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+   LLVMValueRef tid = get_thread_id_in_tg(ctx);
+   LLVMValueRef tmp, tmp2;
+   LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false);
+   LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false);
+   LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false);
+   LLVMValueRef so_buffer[4] = {};
+   unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) + (nggso->vertices[2] ? 1 : 0);
+   LLVMValueRef prim_stride_dw[4] = {};
+   LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32);
+   int stream_for_buffer[4] = {-1, -1, -1, -1};
+   unsigned bufmask_for_stream[4] = {};
+   bool isgs = ctx->type == PIPE_SHADER_GEOMETRY;
+   unsigned scratch_emit_base = isgs ? 4 : 0;
+   LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->ac.i32_0;
+   unsigned scratch_offset_base = isgs ? 8 : 4;
+   LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4;
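+   /* A sketch of the gs_ngg_scratch dword layout assumed below: with a GS,
+    * per-stream generated counts live in [0..3], emit counts in [4..7] and
+    * buffer offsets in [8..11]; without a GS, emit counts are in [0..3] and
+    * offsets in [4..7]. The per-stream workgroup scan scratch starts at
+    * dword 12.
+    */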
+
+   ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256);
+
+   /* Determine the mapping of streamout buffers to vertex streams. */
+   for (unsigned i = 0; i < so->num_outputs; ++i) {
+      unsigned buf = so->output[i].output_buffer;
+      unsigned stream = so->output[i].stream;
+      assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream);
+      stream_for_buffer[buf] = stream;
+      bufmask_for_stream[stream] |= 1 << buf;
+   }
+
+   for (unsigned buffer = 0; buffer < 4; ++buffer) {
+      if (stream_for_buffer[buffer] == -1)
+         continue;
+
+      assert(so->stride[buffer]);
+
+      tmp = LLVMConstInt(ctx->ac.i32, so->stride[buffer], false);
+      prim_stride_dw[buffer] = LLVMBuildMul(builder, tmp, nggso->num_vertices, "");
+      prim_stride_dw_vgpr =
+         ac_build_writelane(&ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer],
+                            LLVMConstInt(ctx->ac.i32, buffer, false));
+
+      so_buffer[buffer] = ac_build_load_to_sgpr(
+         &ctx->ac, buf_ptr, LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + buffer, false));
+   }
+
+   tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
+   ac_build_ifcc(&ctx->ac, tmp, 5200);
+   {
+      LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
+      LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");
+
+      /* Advance the streamout offsets in GDS. */
+      LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
+      LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
+
+      tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
+      ac_build_ifcc(&ctx->ac, tmp, 5210);
+      {
+         if (isgs) {
+            tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid);
+            tmp = LLVMBuildLoad(builder, tmp, "");
+         } else {
+            tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0, ngg_get_prim_cnt(ctx), ctx->ac.i32_0);
+         }
+         LLVMBuildStore(builder, tmp, generated_by_stream_vgpr);
+
+         unsigned swizzle[4];
+         int unused_stream = -1;
+         for (unsigned stream = 0; stream < 4; ++stream) {
+            if (!info->num_stream_output_components[stream]) {
+               unused_stream = stream;
+               break;
+            }
+         }
+         for (unsigned buffer = 0; buffer < 4; ++buffer) {
+            if (stream_for_buffer[buffer] >= 0) {
+               swizzle[buffer] = stream_for_buffer[buffer];
+            } else {
+               assert(unused_stream >= 0);
+               swizzle[buffer] = unused_stream;
+            }
+         }
+
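+         /* Each of the first four lanes handles one streamout buffer: swizzle
+          * the per-stream primitive counts so that lane "buffer" reads the
+          * count of the stream feeding that buffer, then convert it to a
+          * dword advance using the per-buffer primitive stride.
+          */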
+         tmp = ac_build_quad_swizzle(&ctx->ac, tmp, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
+         tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
+
+         LLVMValueRef args[] = {
+            LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""),
+            tmp,
+            ctx->ac.i32_0,                             // ordering
+            ctx->ac.i32_0,                             // scope
+            ctx->ac.i1false,                           // isVolatile
+            LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index
+            ctx->ac.i1true,                            // wave release
+            ctx->ac.i1true,                            // wave done
+         };
+         tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32, args,
+                                  ARRAY_SIZE(args), 0);
+
+         /* Keep offsets in a VGPR for quick retrieval via readlane by
+          * the first wave for bounds checking, and also store in LDS
+          * for retrieval by all waves later. */
+         LLVMBuildStore(builder, tmp, offsets_vgpr);
+
+         tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_offset_basev, "");
+         tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2);
+         LLVMBuildStore(builder, tmp, tmp2);
+      }
+      ac_build_endif(&ctx->ac, 5210);
+
+      /* Determine the max emit per buffer. This is done via the SALU, in part
+       * because LLVM can't generate divide-by-multiply if we try to do this
+       * via VALU with one lane per buffer.
+       */
+      LLVMValueRef max_emit[4] = {};
+      for (unsigned buffer = 0; buffer < 4; ++buffer) {
+         if (stream_for_buffer[buffer] == -1)
+            continue;
+
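+         /* Dword 2 of the streamout buffer descriptor is assumed to hold the
+          * buffer size in bytes (num_records); shift right by 2 to get the
+          * size in dwords.
+          */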
+         LLVMValueRef bufsize_dw = LLVMBuildLShr(
+            builder, LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""), i32_2, "");
+
+         tmp = LLVMBuildLoad(builder, offsets_vgpr, "");
+         LLVMValueRef offset_dw =
+            ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, buffer, false));
+
+         tmp = LLVMBuildSub(builder, bufsize_dw, offset_dw, "");
+         tmp = LLVMBuildUDiv(builder, tmp, prim_stride_dw[buffer], "");
+
+         tmp2 = LLVMBuildICmp(builder, LLVMIntULT, bufsize_dw, offset_dw, "");
+         max_emit[buffer] = LLVMBuildSelect(builder, tmp2, ctx->ac.i32_0, tmp, "");
+      }
+
+      /* Determine the number of emitted primitives per stream and fixup the
+       * GDS counter if necessary.
+       *
+       * This is complicated by the fact that a single stream can emit to
+       * multiple buffers (but luckily not vice versa).
+       */
+      LLVMValueRef emit_vgpr = ctx->ac.i32_0;
+
+      for (unsigned stream = 0; stream < 4; ++stream) {
+         if (!info->num_stream_output_components[stream])
+            continue;
+
+         tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, "");
+         LLVMValueRef generated =
+            ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, stream, false));
+
+         LLVMValueRef emit = generated;
+         for (unsigned buffer = 0; buffer < 4; ++buffer) {
+            if (stream_for_buffer[buffer] == stream)
+               emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]);
+         }
+
+         emit_vgpr =
+            ac_build_writelane(&ctx->ac, emit_vgpr, emit, LLVMConstInt(ctx->ac.i32, stream, false));
+
+         /* Fixup the offset using a plain GDS atomic if we overflowed. */
+         tmp = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, "");
+         ac_build_ifcc(&ctx->ac, tmp, 5221); /* scalar branch */
+         tmp = LLVMBuildLShr(builder, LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false),
+                             ac_get_thread_id(&ctx->ac), "");
+         tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+         ac_build_ifcc(&ctx->ac, tmp, 5222);
+         {
+            tmp = LLVMBuildSub(builder, generated, emit, "");
+            tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
+            tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, "");
+            LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp,
+                               LLVMAtomicOrderingMonotonic, false);
+         }
+         ac_build_endif(&ctx->ac, 5222);
+         ac_build_endif(&ctx->ac, 5221);
+      }
+
+      tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
+      ac_build_ifcc(&ctx->ac, tmp, 5225);
+      {
+         tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_emit_basev, "");
+         tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp);
+         LLVMBuildStore(builder, emit_vgpr, tmp);
+      }
+      ac_build_endif(&ctx->ac, 5225);
+   }
+   ac_build_endif(&ctx->ac, 5200);
+
+   /* Determine the workgroup-relative per-thread / primitive offset into
+    * the streamout buffers */
+   struct ac_wg_scan primemit_scan[4] = {};
+
+   if (isgs) {
+      for (unsigned stream = 0; stream < 4; ++stream) {
+         if (!info->num_stream_output_components[stream])
+            continue;
+
+         primemit_scan[stream].enable_exclusive = true;
+         primemit_scan[stream].op = nir_op_iadd;
+         primemit_scan[stream].src = nggso->prim_enable[stream];
+         primemit_scan[stream].scratch = ac_build_gep0(
+            &ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false));
+         primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx);
+         primemit_scan[stream].numwaves = get_tgsize(ctx);
+         primemit_scan[stream].maxwaves = 8;
+         ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]);
+      }
+   }
+
+   ac_build_s_barrier(&ctx->ac);
+
+   /* Fetch the per-buffer offsets and per-stream emit counts in all waves. */
+   LLVMValueRef wgoffset_dw[4] = {};
+
+   {
+      LLVMValueRef scratch_vgpr;
+
+      tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac));
+      scratch_vgpr = LLVMBuildLoad(builder, tmp, "");
+
+      for (unsigned buffer = 0; buffer < 4; ++buffer) {
+         if (stream_for_buffer[buffer] >= 0) {
+            wgoffset_dw[buffer] =
+               ac_build_readlane(&ctx->ac, scratch_vgpr,
+                                 LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false));
+         }
+      }
+
+      for (unsigned stream = 0; stream < 4; ++stream) {
+         if (info->num_stream_output_components[stream]) {
+            nggso->emit[stream] =
+               ac_build_readlane(&ctx->ac, scratch_vgpr,
+                                 LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false));
+         }
+      }
+   }
+
+   /* Write out primitive data */
+   for (unsigned stream = 0; stream < 4; ++stream) {
+      if (!info->num_stream_output_components[stream])
+         continue;
+
+      if (isgs) {
+         ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]);
+      } else {
+         primemit_scan[stream].result_exclusive = tid;
+      }
+
+      tmp = LLVMBuildICmp(builder, LLVMIntULT, primemit_scan[stream].result_exclusive,
+                          nggso->emit[stream], "");
+      tmp = LLVMBuildAnd(builder, tmp, nggso->prim_enable[stream], "");
+      ac_build_ifcc(&ctx->ac, tmp, 5240);
+      {
+         LLVMValueRef offset_vtx =
+            LLVMBuildMul(builder, primemit_scan[stream].result_exclusive, nggso->num_vertices, "");
+
+         for (unsigned i = 0; i < max_num_vertices; ++i) {
+            tmp = LLVMBuildICmp(builder, LLVMIntULT, LLVMConstInt(ctx->ac.i32, i, false),
+                                nggso->num_vertices, "");
+            ac_build_ifcc(&ctx->ac, tmp, 5241);
+            build_streamout_vertex(ctx, so_buffer, wgoffset_dw, stream, offset_vtx,
+                                   nggso->vertices[i]);
+            ac_build_endif(&ctx->ac, 5241);
+            offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, "");
+         }
+      }
+      ac_build_endif(&ctx->ac, 5240);
+   }
 }
 
 /* LDS layout of ES vertex data for NGG culling. */
-enum {
-       /* Byte 0: Boolean ES thread accepted (unculled) flag, and later the old
-        *         ES thread ID. After vertex compaction, compacted ES threads
-        *         store the old thread ID here to copy input VGPRs from uncompacted
-        *         ES threads.
-        * Byte 1: New ES thread ID, loaded by GS to prepare the prim export value.
-        * Byte 2: TES rel patch ID
-        * Byte 3: Unused
-        */
-       lds_byte0_accept_flag = 0,
-       lds_byte0_old_thread_id = 0,
-       lds_byte1_new_thread_id,
-       lds_byte2_tes_rel_patch_id,
-       lds_byte3_unused,
-
-       lds_packed_data = 0, /* lds_byteN_... */
-
-       lds_pos_x,
-       lds_pos_y,
-       lds_pos_z,
-       lds_pos_w,
-       lds_pos_x_div_w,
-       lds_pos_y_div_w,
-       /* If VS: */
-       lds_vertex_id,
-       lds_instance_id, /* optional */
-       /* If TES: */
-       lds_tes_u = lds_vertex_id,
-       lds_tes_v = lds_instance_id,
-       lds_tes_patch_id, /* optional */
+enum
+{
+   /* Byte 0: Boolean ES thread accepted (unculled) flag, and later the old
+    *         ES thread ID. After vertex compaction, compacted ES threads
+    *         store the old thread ID here to copy input VGPRs from uncompacted
+    *         ES threads.
+    * Byte 1: New ES thread ID, loaded by GS to prepare the prim export value.
+    * Byte 2: TES rel patch ID
+    * Byte 3: Unused
+    */
+   lds_byte0_accept_flag = 0,
+   lds_byte0_old_thread_id = 0,
+   lds_byte1_new_thread_id,
+   lds_byte2_tes_rel_patch_id,
+   lds_byte3_unused,
+
+   lds_packed_data = 0, /* lds_byteN_... */
+
+   lds_pos_x,
+   lds_pos_y,
+   lds_pos_z,
+   lds_pos_w,
+   lds_pos_x_div_w,
+   lds_pos_y_div_w,
+   /* If VS: */
+   lds_vertex_id,
+   lds_instance_id, /* optional */
+   /* If TES: */
+   lds_tes_u = lds_vertex_id,
+   lds_tes_v = lds_instance_id,
+   lds_tes_patch_id, /* optional */
 };
 
-static LLVMValueRef si_build_gep_i8(struct si_shader_context *ctx,
-                                   LLVMValueRef ptr, unsigned byte_index)
+static LLVMValueRef si_build_gep_i8(struct si_shader_context *ctx, LLVMValueRef ptr,
+                                    unsigned byte_index)
 {
-       assert(byte_index < 4);
-       LLVMTypeRef pi8 = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS);
-       LLVMValueRef index = LLVMConstInt(ctx->ac.i32, byte_index, 0);
+   assert(byte_index < 4);
+   LLVMTypeRef pi8 = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS);
+   LLVMValueRef index = LLVMConstInt(ctx->ac.i32, byte_index, 0);
 
-       return LLVMBuildGEP(ctx->ac.builder,
-                           LLVMBuildPointerCast(ctx->ac.builder, ptr, pi8, ""),
-                           &index, 1, "");
+   return LLVMBuildGEP(ctx->ac.builder, LLVMBuildPointerCast(ctx->ac.builder, ptr, pi8, ""), &index,
+                       1, "");
 }
 
 static unsigned ngg_nogs_vertex_size(struct si_shader *shader)
 {
-       unsigned lds_vertex_size = 0;
-
-       /* The edgeflag is always stored in the last element that's also
-        * used for padding to reduce LDS bank conflicts. */
-       if (shader->selector->so.num_outputs)
-               lds_vertex_size = 4 * shader->selector->info.num_outputs + 1;
-       if (shader->selector->info.writes_edgeflag)
-               lds_vertex_size = MAX2(lds_vertex_size, 1);
-
-       /* LDS size for passing data from GS to ES.
-        * GS stores Primitive IDs into LDS at the address corresponding
-        * to the ES thread of the provoking vertex. All ES threads
-        * load and export PrimitiveID for their thread.
-        */
-       if (shader->selector->type == PIPE_SHADER_VERTEX &&
-           shader->key.mono.u.vs_export_prim_id)
-               lds_vertex_size = MAX2(lds_vertex_size, 1);
-
-       if (shader->key.opt.ngg_culling) {
-               if (shader->selector->type == PIPE_SHADER_VERTEX) {
-                       STATIC_ASSERT(lds_instance_id + 1 == 9);
-                       lds_vertex_size = MAX2(lds_vertex_size, 9);
-               } else {
-                       assert(shader->selector->type == PIPE_SHADER_TESS_EVAL);
-
-                       if (shader->selector->info.uses_primid ||
-                           shader->key.mono.u.vs_export_prim_id) {
-                               STATIC_ASSERT(lds_tes_patch_id + 2 == 11);
-                               lds_vertex_size = MAX2(lds_vertex_size, 11);
-                       } else {
-                               STATIC_ASSERT(lds_tes_v + 1 == 9);
-                               lds_vertex_size = MAX2(lds_vertex_size, 9);
-                       }
-               }
-       }
-
-       return lds_vertex_size;
+   unsigned lds_vertex_size = 0;
+
+   /* The edgeflag is always stored in the last element that's also
+    * used for padding to reduce LDS bank conflicts. */
+   if (shader->selector->so.num_outputs)
+      lds_vertex_size = 4 * shader->selector->info.num_outputs + 1;
+   if (shader->selector->info.writes_edgeflag)
+      lds_vertex_size = MAX2(lds_vertex_size, 1);
+
+   /* LDS size for passing data from GS to ES.
+    * GS stores Primitive IDs into LDS at the address corresponding
+    * to the ES thread of the provoking vertex. All ES threads
+    * load and export PrimitiveID for their thread.
+    */
+   if (shader->selector->type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id)
+      lds_vertex_size = MAX2(lds_vertex_size, 1);
+
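+   /* NGG culling keeps the full per-vertex LDS layout declared above, so the
+    * vertex needs at least 9 dwords (through lds_instance_id / lds_tes_v) and
+    * 11 dwords when the TES patch ID is also stored; the STATIC_ASSERTs below
+    * tie these constants to the enum.
+    */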
+   if (shader->key.opt.ngg_culling) {
+      if (shader->selector->type == PIPE_SHADER_VERTEX) {
+         STATIC_ASSERT(lds_instance_id + 1 == 9);
+         lds_vertex_size = MAX2(lds_vertex_size, 9);
+      } else {
+         assert(shader->selector->type == PIPE_SHADER_TESS_EVAL);
+
+         if (shader->selector->info.uses_primid || shader->key.mono.u.vs_export_prim_id) {
+            STATIC_ASSERT(lds_tes_patch_id + 2 == 11);
+            lds_vertex_size = MAX2(lds_vertex_size, 11);
+         } else {
+            STATIC_ASSERT(lds_tes_v + 1 == 9);
+            lds_vertex_size = MAX2(lds_vertex_size, 9);
+         }
+      }
+   }
+
+   return lds_vertex_size;
 }
 
 /**
  * Returns an `[N x i32] addrspace(LDS)*` pointing at contiguous LDS storage
  * for the vertex outputs.
  */
-static LLVMValueRef ngg_nogs_vertex_ptr(struct si_shader_context *ctx,
-                                       LLVMValueRef vtxid)
+static LLVMValueRef ngg_nogs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vtxid)
 {
-       /* The extra dword is used to avoid LDS bank conflicts. */
-       unsigned vertex_size = ngg_nogs_vertex_size(ctx->shader);
-       LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, vertex_size);
-       LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS);
-       LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, "");
-       return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, "");
+   /* The extra dword is used to avoid LDS bank conflicts. */
+   unsigned vertex_size = ngg_nogs_vertex_size(ctx->shader);
+   LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, vertex_size);
+   LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS);
+   LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, "");
+   return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, "");
 }
 
-static LLVMValueRef si_insert_input_v4i32(struct si_shader_context *ctx,
-                                         LLVMValueRef ret, struct ac_arg param,
-                                         unsigned return_index)
+static LLVMValueRef si_insert_input_v4i32(struct si_shader_context *ctx, LLVMValueRef ret,
+                                          struct ac_arg param, unsigned return_index)
 {
-       LLVMValueRef v = ac_get_arg(&ctx->ac, param);
-
-       for (unsigned i = 0; i < 4; i++) {
-               ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
-                                          ac_llvm_extract_elem(&ctx->ac, v, i),
-                                          return_index + i, "");
-       }
-       return ret;
+   LLVMValueRef v = ac_get_arg(&ctx->ac, param);
+
+   for (unsigned i = 0; i < 4; i++) {
+      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, ac_llvm_extract_elem(&ctx->ac, v, i),
+                                 return_index + i, "");
+   }
+   return ret;
 }
 
-static void load_bitmasks_2x64(struct si_shader_context *ctx,
-                              LLVMValueRef lds_ptr, unsigned dw_offset,
-                              LLVMValueRef mask[2], LLVMValueRef *total_bitcount)
+static void load_bitmasks_2x64(struct si_shader_context *ctx, LLVMValueRef lds_ptr,
+                               unsigned dw_offset, LLVMValueRef mask[2],
+                               LLVMValueRef *total_bitcount)
 {
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef ptr64 = LLVMBuildPointerCast(builder, lds_ptr,
-                                                 LLVMPointerType(LLVMArrayType(ctx->ac.i64, 2),
-                                                                 AC_ADDR_SPACE_LDS), "");
-       for (unsigned i = 0; i < 2; i++) {
-               LLVMValueRef index = LLVMConstInt(ctx->ac.i32, dw_offset / 2 + i, 0);
-               mask[i] = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ptr64, index), "");
-       }
-
-       /* We get better code if we don't use the 128-bit bitcount. */
-       *total_bitcount = LLVMBuildAdd(builder, ac_build_bit_count(&ctx->ac, mask[0]),
-                                      ac_build_bit_count(&ctx->ac, mask[1]), "");
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMValueRef ptr64 = LLVMBuildPointerCast(
+      builder, lds_ptr, LLVMPointerType(LLVMArrayType(ctx->ac.i64, 2), AC_ADDR_SPACE_LDS), "");
+   for (unsigned i = 0; i < 2; i++) {
+      LLVMValueRef index = LLVMConstInt(ctx->ac.i32, dw_offset / 2 + i, 0);
+      mask[i] = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ptr64, index), "");
+   }
+
+   /* We get better code if we don't use the 128-bit bitcount. */
+   *total_bitcount = LLVMBuildAdd(builder, ac_build_bit_count(&ctx->ac, mask[0]),
+                                  ac_build_bit_count(&ctx->ac, mask[1]), "");
 }
 
 /**
@@ -711,38 +678,33 @@ static void load_bitmasks_2x64(struct si_shader_context *ctx,
  * \param wave_info_num_bits the bit size of thread count field in merged_wave_info
  * \param wave_info_shift    the bit offset of the thread count field in merged_wave_info
  */
-static void update_thread_counts(struct si_shader_context *ctx,
-                                LLVMValueRef *new_num_threads,
-                                LLVMValueRef *tg_info,
-                                unsigned tg_info_num_bits,
-                                unsigned tg_info_shift,
-                                LLVMValueRef *wave_info,
-                                unsigned wave_info_num_bits,
-                                unsigned wave_info_shift)
+static void update_thread_counts(struct si_shader_context *ctx, LLVMValueRef *new_num_threads,
+                                 LLVMValueRef *tg_info, unsigned tg_info_num_bits,
+                                 unsigned tg_info_shift, LLVMValueRef *wave_info,
+                                 unsigned wave_info_num_bits, unsigned wave_info_shift)
 {
-       LLVMBuilderRef builder = ctx->ac.builder;
-
-       /* Update the total thread count. */
-       unsigned tg_info_mask = ~(u_bit_consecutive(0, tg_info_num_bits) << tg_info_shift);
-       *tg_info = LLVMBuildAnd(builder, *tg_info,
-                               LLVMConstInt(ctx->ac.i32, tg_info_mask, 0), "");
-       *tg_info = LLVMBuildOr(builder, *tg_info,
-                              LLVMBuildShl(builder, *new_num_threads,
-                                           LLVMConstInt(ctx->ac.i32, tg_info_shift, 0), ""), "");
-
-       /* Update the per-wave thread count. */
-       LLVMValueRef prev_threads = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
-                                                LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), "");
-       *new_num_threads = LLVMBuildSub(builder, *new_num_threads, prev_threads, "");
-       *new_num_threads = ac_build_imax(&ctx->ac, *new_num_threads, ctx->ac.i32_0);
-       *new_num_threads = ac_build_imin(&ctx->ac, *new_num_threads,
-                                       LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0));
-       unsigned wave_info_mask = ~(u_bit_consecutive(0, wave_info_num_bits) << wave_info_shift);
-       *wave_info = LLVMBuildAnd(builder, *wave_info,
-                                 LLVMConstInt(ctx->ac.i32, wave_info_mask, 0), "");
-       *wave_info = LLVMBuildOr(builder, *wave_info,
-                                LLVMBuildShl(builder, *new_num_threads,
-                                             LLVMConstInt(ctx->ac.i32, wave_info_shift, 0), ""), "");
+   LLVMBuilderRef builder = ctx->ac.builder;
+
+   /* Update the total thread count. */
+   unsigned tg_info_mask = ~(u_bit_consecutive(0, tg_info_num_bits) << tg_info_shift);
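+   /* E.g., hypothetically with tg_info_num_bits = 9 and tg_info_shift = 12,
+    * u_bit_consecutive(0, 9) == 0x1ff and tg_info_mask == ~(0x1ff << 12), so
+    * the old count in bits [20:12] is cleared before the new one is OR'd in.
+    */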
+   *tg_info = LLVMBuildAnd(builder, *tg_info, LLVMConstInt(ctx->ac.i32, tg_info_mask, 0), "");
+   *tg_info = LLVMBuildOr(
+      builder, *tg_info,
+      LLVMBuildShl(builder, *new_num_threads, LLVMConstInt(ctx->ac.i32, tg_info_shift, 0), ""), "");
+
+   /* Update the per-wave thread count. */
+   LLVMValueRef prev_threads = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
+                                            LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), "");
+   *new_num_threads = LLVMBuildSub(builder, *new_num_threads, prev_threads, "");
+   *new_num_threads = ac_build_imax(&ctx->ac, *new_num_threads, ctx->ac.i32_0);
+   *new_num_threads =
+      ac_build_imin(&ctx->ac, *new_num_threads, LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0));
+   unsigned wave_info_mask = ~(u_bit_consecutive(0, wave_info_num_bits) << wave_info_shift);
+   *wave_info = LLVMBuildAnd(builder, *wave_info, LLVMConstInt(ctx->ac.i32, wave_info_mask, 0), "");
+   *wave_info = LLVMBuildOr(
+      builder, *wave_info,
+      LLVMBuildShl(builder, *new_num_threads, LLVMConstInt(ctx->ac.i32, wave_info_shift, 0), ""),
+      "");
 }
 
 /**
@@ -751,759 +713,719 @@ static void update_thread_counts(struct si_shader_context *ctx,
  * Also return the position, which is passed to the shader as an input,
  * so that we don't compute it twice.
  */
-void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi,
-                                              unsigned max_outputs,
-                                              LLVMValueRef *addrs)
+void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, unsigned max_outputs,
+                                               LLVMValueRef *addrs)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct si_shader *shader = ctx->shader;
-       struct si_shader_selector *sel = shader->selector;
-       struct si_shader_info *info = &sel->info;
-       LLVMBuilderRef builder = ctx->ac.builder;
-
-       assert(shader->key.opt.ngg_culling);
-       assert(shader->key.as_ngg);
-       assert(sel->type == PIPE_SHADER_VERTEX ||
-              (sel->type == PIPE_SHADER_TESS_EVAL && !shader->key.as_es));
-
-       LLVMValueRef position[4] = {};
-       for (unsigned i = 0; i < info->num_outputs; i++) {
-               switch (info->output_semantic_name[i]) {
-               case TGSI_SEMANTIC_POSITION:
-                       for (unsigned j = 0; j < 4; j++) {
-                               position[j] = LLVMBuildLoad(ctx->ac.builder,
-                                                           addrs[4 * i + j], "");
-                       }
-                       break;
-               }
-       }
-       assert(position[0]);
-
-       /* Store Position.XYZW into LDS. */
-       LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
-       for (unsigned chan = 0; chan < 4; chan++) {
-               LLVMBuildStore(builder, ac_to_integer(&ctx->ac, position[chan]),
-                               ac_build_gep0(&ctx->ac, es_vtxptr,
-                                             LLVMConstInt(ctx->ac.i32, lds_pos_x + chan, 0)));
-       }
-       /* Store Position.XY / W into LDS. */
-       for (unsigned chan = 0; chan < 2; chan++) {
-               LLVMValueRef val = ac_build_fdiv(&ctx->ac, position[chan], position[3]);
-               LLVMBuildStore(builder, ac_to_integer(&ctx->ac, val),
-                               ac_build_gep0(&ctx->ac, es_vtxptr,
-                                             LLVMConstInt(ctx->ac.i32, lds_pos_x_div_w + chan, 0)));
-       }
-
-       /* Store VertexID and InstanceID. ES threads will have to load them
-        * from LDS after vertex compaction and use them instead of their own
-        * system values.
-        */
-       bool uses_instance_id = false;
-       bool uses_tes_prim_id = false;
-       LLVMValueRef packed_data = ctx->ac.i32_0;
-
-       if (ctx->type == PIPE_SHADER_VERTEX) {
-               uses_instance_id = sel->info.uses_instanceid ||
-                                  shader->key.part.vs.prolog.instance_divisor_is_one ||
-                                  shader->key.part.vs.prolog.instance_divisor_is_fetched;
-
-               LLVMBuildStore(builder, ctx->abi.vertex_id,
-                              ac_build_gep0(&ctx->ac, es_vtxptr,
-                                            LLVMConstInt(ctx->ac.i32, lds_vertex_id, 0)));
-               if (uses_instance_id) {
-                       LLVMBuildStore(builder, ctx->abi.instance_id,
-                                      ac_build_gep0(&ctx->ac, es_vtxptr,
-                                                    LLVMConstInt(ctx->ac.i32, lds_instance_id, 0)));
-               }
-       } else {
-               uses_tes_prim_id = sel->info.uses_primid ||
-                                  shader->key.mono.u.vs_export_prim_id;
-
-               assert(ctx->type == PIPE_SHADER_TESS_EVAL);
-               LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_u)),
-                              ac_build_gep0(&ctx->ac, es_vtxptr,
-                                            LLVMConstInt(ctx->ac.i32, lds_tes_u, 0)));
-               LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_v)),
-                              ac_build_gep0(&ctx->ac, es_vtxptr,
-                                            LLVMConstInt(ctx->ac.i32, lds_tes_v, 0)));
-               packed_data = LLVMBuildShl(builder, ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id),
-                                          LLVMConstInt(ctx->ac.i32, lds_byte2_tes_rel_patch_id * 8, 0), "");
-               if (uses_tes_prim_id) {
-                       LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args.tes_patch_id),
-                                      ac_build_gep0(&ctx->ac, es_vtxptr,
-                                                    LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)));
-               }
-       }
-       /* Initialize the packed data. */
-       LLVMBuildStore(builder, packed_data,
-                      ac_build_gep0(&ctx->ac, es_vtxptr,
-                                    LLVMConstInt(ctx->ac.i32, lds_packed_data, 0)));
-       ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
-
-       LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
-
-       /* Initialize the last 3 gs_ngg_scratch dwords to 0, because we may have less
-        * than 4 waves, but we always read all 4 values. This is where the thread
-        * bitmasks of unculled threads will be stored.
-        *
-        * gs_ngg_scratch layout: esmask[0..3]
-        */
-       ac_build_ifcc(&ctx->ac,
-                     LLVMBuildICmp(builder, LLVMIntULT, get_thread_id_in_tg(ctx),
-                                   LLVMConstInt(ctx->ac.i32, 3, 0), ""), 16101);
-       {
-               LLVMValueRef index = LLVMBuildAdd(builder, tid, ctx->ac.i32_1, "");
-               LLVMBuildStore(builder, ctx->ac.i32_0,
-                              ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, index));
-       }
-       ac_build_endif(&ctx->ac, 16101);
-       ac_build_s_barrier(&ctx->ac);
-
-       /* The hardware requires that there are no holes between unculled vertices,
-        * which means we have to pack ES threads, i.e. reduce the ES thread count
-        * and move ES input VGPRs to lower threads. The upside is that varyings
-        * are only fetched and computed for unculled vertices.
-        *
-        * Vertex compaction in GS threads:
-        *
-        * Part 1: Compute the surviving vertex mask in GS threads:
-        * - Compute 4 32-bit surviving vertex masks in LDS. (max 4 waves)
-        *   - In GS, notify ES threads whether the vertex survived.
-        *   - Barrier
-        *   - ES threads will create the mask and store it in LDS.
-        * - Barrier
-        * - Each GS thread loads the vertex masks from LDS.
-        *
-        * Part 2: Compact ES threads in GS threads:
-        * - Compute the prefix sum for all 3 vertices from the masks. These are the new
-        *   thread IDs for each vertex within the primitive.
-        * - Write the value of the old thread ID into the LDS address of the new thread ID.
-        *   The ES thread will load the old thread ID and use it to load the position, VertexID,
-        *   and InstanceID.
-        * - Update vertex indices and null flag in the GS input VGPRs.
-        * - Barrier
-        *
-        * Part 3: Update inputs GPRs
-        * - For all waves, update per-wave thread counts in input SGPRs.
-        * - In ES threads, update the ES input VGPRs (VertexID, InstanceID, TES inputs).
-        */
-
-       LLVMValueRef vtxindex[3];
-       if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) {
-               /* For the GS fast launch, the VS prologs simply puts the Vertex IDs
-                * into these VGPRs.
-                */
-               vtxindex[0] = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);
-               vtxindex[1] = ac_get_arg(&ctx->ac, ctx->gs_vtx23_offset);
-               vtxindex[2] = ac_get_arg(&ctx->ac, ctx->gs_vtx45_offset);
-       } else {
-               vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
-               vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
-               vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
-       };
-       LLVMValueRef gs_vtxptr[] = {
-               ngg_nogs_vertex_ptr(ctx, vtxindex[0]),
-               ngg_nogs_vertex_ptr(ctx, vtxindex[1]),
-               ngg_nogs_vertex_ptr(ctx, vtxindex[2]),
-       };
-       es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
-
-       LLVMValueRef gs_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
-
-       /* Do culling in GS threads. */
-       ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 16002);
-       {
-               /* Load positions. */
-               LLVMValueRef pos[3][4] = {};
-               for (unsigned vtx = 0; vtx < 3; vtx++) {
-                       for (unsigned chan = 0; chan < 4; chan++) {
-                               unsigned index;
-                               if (chan == 0 || chan == 1)
-                                       index = lds_pos_x_div_w + chan;
-                               else if (chan == 3)
-                                       index = lds_pos_w;
-                               else
-                                       continue;
-
-                               LLVMValueRef addr = ac_build_gep0(&ctx->ac, gs_vtxptr[vtx],
-                                                                 LLVMConstInt(ctx->ac.i32, index, 0));
-                               pos[vtx][chan] = LLVMBuildLoad(builder, addr, "");
-                               pos[vtx][chan] = ac_to_float(&ctx->ac, pos[vtx][chan]);
-                       }
-               }
-
-               /* Load the viewport state for small prim culling. */
-               LLVMValueRef vp = ac_build_load_invariant(&ctx->ac,
-                                                         ac_get_arg(&ctx->ac, ctx->small_prim_cull_info),
-                                                         ctx->ac.i32_0);
-               vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
-               LLVMValueRef vp_scale[2], vp_translate[2];
-               vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
-               vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
-               vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
-               vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);
-
-               /* Get the small prim filter precision. */
-               LLVMValueRef small_prim_precision = si_unpack_param(ctx, ctx->vs_state_bits, 7, 4);
-               small_prim_precision = LLVMBuildOr(builder, small_prim_precision,
-                                                  LLVMConstInt(ctx->ac.i32, 0x70, 0), "");
-               small_prim_precision = LLVMBuildShl(builder, small_prim_precision,
-                                                   LLVMConstInt(ctx->ac.i32, 23, 0), "");
-               small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->ac.f32, "");
-
-               /* Execute culling code. */
-               struct ac_cull_options options = {};
-               options.cull_front = shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE;
-               options.cull_back = shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE;
-               options.cull_view_xy = shader->key.opt.ngg_culling & SI_NGG_CULL_VIEW_SMALLPRIMS;
-               options.cull_small_prims = options.cull_view_xy;
-               options.cull_zero_area = options.cull_front || options.cull_back;
-               options.cull_w = true;
-
-               /* Tell ES threads whether their vertex survived. */
-               ac_build_ifcc(&ctx->ac, ac_cull_triangle(&ctx->ac, pos, ctx->ac.i1true,
-                                                        vp_scale, vp_translate,
-                                                        small_prim_precision, &options), 16003);
-               {
-                       LLVMBuildStore(builder, ctx->ac.i32_1, gs_accepted);
-                       for (unsigned vtx = 0; vtx < 3; vtx++) {
-                               LLVMBuildStore(builder, ctx->ac.i8_1,
-                                              si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte0_accept_flag));
-                       }
-               }
-               ac_build_endif(&ctx->ac, 16003);
-       }
-       ac_build_endif(&ctx->ac, 16002);
-       ac_build_s_barrier(&ctx->ac);
-
-       gs_accepted = LLVMBuildLoad(builder, gs_accepted, "");
-
-       LLVMValueRef es_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i1, "");
-
-       /* Convert the per-vertex flag to a thread bitmask in ES threads and store it in LDS. */
-       ac_build_ifcc(&ctx->ac, si_is_es_thread(ctx), 16007);
-       {
-               LLVMValueRef es_accepted_flag =
-                       LLVMBuildLoad(builder,
-                                     si_build_gep_i8(ctx, es_vtxptr, lds_byte0_accept_flag), "");
-
-               LLVMValueRef es_accepted_bool = LLVMBuildICmp(builder, LLVMIntNE,
-                                                             es_accepted_flag, ctx->ac.i8_0, "");
-               LLVMValueRef es_mask = ac_get_i1_sgpr_mask(&ctx->ac, es_accepted_bool);
-
-               LLVMBuildStore(builder, es_accepted_bool, es_accepted);
-
-               ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntEQ,
-                                                     tid, ctx->ac.i32_0, ""), 16008);
-               {
-                       LLVMBuildStore(builder, es_mask,
-                                      ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch,
-                                                    get_wave_id_in_tg(ctx)));
-               }
-               ac_build_endif(&ctx->ac, 16008);
-       }
-       ac_build_endif(&ctx->ac, 16007);
-       ac_build_s_barrier(&ctx->ac);
-
-       /* Load the vertex masks and compute the new ES thread count. */
-       LLVMValueRef es_mask[2], new_num_es_threads, kill_wave;
-       load_bitmasks_2x64(ctx, ctx->gs_ngg_scratch, 0, es_mask, &new_num_es_threads);
-       new_num_es_threads = ac_build_readlane_no_opt_barrier(&ctx->ac, new_num_es_threads, NULL);
-
-       /* ES threads compute their prefix sum, which is the new ES thread ID.
-        * Then they write the value of the old thread ID into the LDS address
-        * of the new thread ID. It will be used it to load input VGPRs from
-        * the old thread's LDS location.
-        */
-       ac_build_ifcc(&ctx->ac, LLVMBuildLoad(builder, es_accepted, ""), 16009);
-       {
-               LLVMValueRef old_id = get_thread_id_in_tg(ctx);
-               LLVMValueRef new_id = ac_prefix_bitcount_2x64(&ctx->ac, es_mask, old_id);
-
-               LLVMBuildStore(builder, LLVMBuildTrunc(builder, old_id, ctx->ac.i8, ""),
-                              si_build_gep_i8(ctx, ngg_nogs_vertex_ptr(ctx, new_id),
-                                              lds_byte0_old_thread_id));
-               LLVMBuildStore(builder, LLVMBuildTrunc(builder, new_id, ctx->ac.i8, ""),
-                              si_build_gep_i8(ctx, es_vtxptr, lds_byte1_new_thread_id));
-       }
-       ac_build_endif(&ctx->ac, 16009);
-
-       /* Kill waves that have inactive threads. */
-       kill_wave = LLVMBuildICmp(builder, LLVMIntULE,
-                                 ac_build_imax(&ctx->ac, new_num_es_threads, ngg_get_prim_cnt(ctx)),
-                                 LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
-                                              LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), ""), "");
-       ac_build_ifcc(&ctx->ac, kill_wave, 19202);
-       {
-               /* If we are killing wave 0, send that there are no primitives
-                * in this threadgroup.
-                */
-               ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx),
-                                             ctx->ac.i32_0, ctx->ac.i32_0);
-               ac_build_s_endpgm(&ctx->ac);
-       }
-       ac_build_endif(&ctx->ac, 19202);
-       ac_build_s_barrier(&ctx->ac);
-
-       /* Send the final vertex and primitive counts. */
-       ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx),
-                                     new_num_es_threads, ngg_get_prim_cnt(ctx));
-
-       /* Update thread counts in SGPRs. */
-       LLVMValueRef new_gs_tg_info = ac_get_arg(&ctx->ac, ctx->gs_tg_info);
-       LLVMValueRef new_merged_wave_info = ac_get_arg(&ctx->ac, ctx->merged_wave_info);
-
-       /* This also converts the thread count from the total count to the per-wave count. */
-       update_thread_counts(ctx, &new_num_es_threads, &new_gs_tg_info, 9, 12,
-                            &new_merged_wave_info, 8, 0);
-
-       /* Update vertex indices in VGPR0 (same format as NGG passthrough). */
-       LLVMValueRef new_vgpr0 = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
-
-       /* Set the null flag at the beginning (culled), and then
-        * overwrite it for accepted primitives.
-        */
-       LLVMBuildStore(builder, LLVMConstInt(ctx->ac.i32, 1u << 31, 0), new_vgpr0);
-
-       /* Get vertex indices after vertex compaction. */
-       ac_build_ifcc(&ctx->ac, LLVMBuildTrunc(builder, gs_accepted, ctx->ac.i1, ""), 16011);
-       {
-               struct ac_ngg_prim prim = {};
-               prim.num_vertices = 3;
-               prim.isnull = ctx->ac.i1false;
-
-               for (unsigned vtx = 0; vtx < 3; vtx++) {
-                       prim.index[vtx] =
-                               LLVMBuildLoad(builder,
-                                             si_build_gep_i8(ctx, gs_vtxptr[vtx],
-                                                             lds_byte1_new_thread_id), "");
-                       prim.index[vtx] = LLVMBuildZExt(builder, prim.index[vtx], ctx->ac.i32, "");
-                       prim.edgeflag[vtx] = ngg_get_initial_edgeflag(ctx, vtx);
-               }
-
-               /* Set the new GS input VGPR. */
-               LLVMBuildStore(builder, ac_pack_prim_export(&ctx->ac, &prim), new_vgpr0);
-       }
-       ac_build_endif(&ctx->ac, 16011);
-
-       if (gfx10_ngg_export_prim_early(shader))
-               gfx10_ngg_build_export_prim(ctx, NULL, LLVMBuildLoad(builder, new_vgpr0, ""));
-
-       /* Set the new ES input VGPRs. */
-       LLVMValueRef es_data[4];
-       LLVMValueRef old_thread_id = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
-
-       for (unsigned i = 0; i < 4; i++)
-               es_data[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
-
-       ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid,
-                                             new_num_es_threads, ""), 16012);
-       {
-               LLVMValueRef old_id, old_es_vtxptr, tmp;
-
-               /* Load ES input VGPRs from the ES thread before compaction. */
-               old_id = LLVMBuildLoad(builder,
-                                      si_build_gep_i8(ctx, es_vtxptr, lds_byte0_old_thread_id), "");
-               old_id = LLVMBuildZExt(builder, old_id, ctx->ac.i32, "");
-
-               LLVMBuildStore(builder, old_id, old_thread_id);
-               old_es_vtxptr = ngg_nogs_vertex_ptr(ctx, old_id);
-
-               for (unsigned i = 0; i < 2; i++) {
-                       tmp = LLVMBuildLoad(builder,
-                                           ac_build_gep0(&ctx->ac, old_es_vtxptr,
-                                                         LLVMConstInt(ctx->ac.i32, lds_vertex_id + i, 0)), "");
-                       LLVMBuildStore(builder, tmp, es_data[i]);
-               }
-
-               if (ctx->type == PIPE_SHADER_TESS_EVAL) {
-                       tmp = LLVMBuildLoad(builder,
-                                           si_build_gep_i8(ctx, old_es_vtxptr,
-                                                           lds_byte2_tes_rel_patch_id), "");
-                       tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
-                       LLVMBuildStore(builder, tmp, es_data[2]);
-
-                       if (uses_tes_prim_id) {
-                               tmp = LLVMBuildLoad(builder,
-                                                   ac_build_gep0(&ctx->ac, old_es_vtxptr,
-                                                                 LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)), "");
-                               LLVMBuildStore(builder, tmp, es_data[3]);
-                       }
-               }
-       }
-       ac_build_endif(&ctx->ac, 16012);
-
-       /* Return values for the main function. */
-       LLVMValueRef ret = ctx->return_value;
-       LLVMValueRef val;
-
-       ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_gs_tg_info, 2, "");
-       ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_merged_wave_info, 3, "");
-       if (ctx->type == PIPE_SHADER_TESS_EVAL)
-               ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 4);
-
-       ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers,
-                                 8 + SI_SGPR_RW_BUFFERS);
-       ret = si_insert_input_ptr(ctx, ret,
-                                 ctx->bindless_samplers_and_images,
-                                 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
-       ret = si_insert_input_ptr(ctx, ret,
-                                 ctx->const_and_shader_buffers,
-                                 8 + SI_SGPR_CONST_AND_SHADER_BUFFERS);
-       ret = si_insert_input_ptr(ctx, ret,
-                                 ctx->samplers_and_images,
-                                 8 + SI_SGPR_SAMPLERS_AND_IMAGES);
-       ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits,
-                                 8 + SI_SGPR_VS_STATE_BITS);
-
-       if (ctx->type == PIPE_SHADER_VERTEX) {
-               ret = si_insert_input_ptr(ctx, ret, ctx->args.base_vertex,
-                                         8 + SI_SGPR_BASE_VERTEX);
-               ret = si_insert_input_ptr(ctx, ret, ctx->args.start_instance,
-                                         8 + SI_SGPR_START_INSTANCE);
-               ret = si_insert_input_ptr(ctx, ret, ctx->args.draw_id,
-                                         8 + SI_SGPR_DRAWID);
-               ret = si_insert_input_ptr(ctx, ret, ctx->vertex_buffers,
-                                         8 + SI_VS_NUM_USER_SGPR);
-
-               for (unsigned i = 0; i < shader->selector->num_vbos_in_user_sgprs; i++) {
-                       ret = si_insert_input_v4i32(ctx, ret, ctx->vb_descriptors[i],
-                                                   8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + i * 4);
-               }
-       } else {
-               assert(ctx->type == PIPE_SHADER_TESS_EVAL);
-               ret = si_insert_input_ptr(ctx, ret, ctx->tcs_offchip_layout,
-                                         8 + SI_SGPR_TES_OFFCHIP_LAYOUT);
-               ret = si_insert_input_ptr(ctx, ret, ctx->tes_offchip_addr,
-                                         8 + SI_SGPR_TES_OFFCHIP_ADDR);
-       }
-
-       unsigned vgpr;
-       if (ctx->type == PIPE_SHADER_VERTEX) {
-               if (shader->selector->num_vbos_in_user_sgprs) {
-                       vgpr = 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST +
-                              shader->selector->num_vbos_in_user_sgprs * 4;
-               } else {
-                       vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1;
-               }
-       } else {
-               vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
-       }
-
-       val = LLVMBuildLoad(builder, new_vgpr0, "");
-       ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val),
-                                  vgpr++, "");
-       vgpr++; /* gs_vtx23_offset */
-
-       ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
-       ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
-       vgpr++; /* gs_vtx45_offset */
-
-       if (ctx->type == PIPE_SHADER_VERTEX) {
-               val = LLVMBuildLoad(builder, es_data[0], "");
-               ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val),
-                                          vgpr++, ""); /* VGPR5 - VertexID */
-               vgpr += 2;
-               if (uses_instance_id) {
-                       val = LLVMBuildLoad(builder, es_data[1], "");
-                       ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val),
-                                                  vgpr++, ""); /* VGPR8 - InstanceID */
-               } else {
-                       vgpr++;
-               }
-       } else {
-               assert(ctx->type == PIPE_SHADER_TESS_EVAL);
-               unsigned num_vgprs = uses_tes_prim_id ? 4 : 3;
-               for (unsigned i = 0; i < num_vgprs; i++) {
-                       val = LLVMBuildLoad(builder, es_data[i], "");
-                       ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val),
-                                                  vgpr++, "");
-               }
-               if (num_vgprs == 3)
-                       vgpr++;
-       }
-       /* Return the old thread ID. */
-       val = LLVMBuildLoad(builder, old_thread_id, "");
-       ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");
-
-       /* These two also use LDS. */
-       if (sel->info.writes_edgeflag ||
-           (ctx->type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id))
-               ac_build_s_barrier(&ctx->ac);
-
-       ctx->return_value = ret;
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader *shader = ctx->shader;
+   struct si_shader_selector *sel = shader->selector;
+   struct si_shader_info *info = &sel->info;
+   LLVMBuilderRef builder = ctx->ac.builder;
+
+   assert(shader->key.opt.ngg_culling);
+   assert(shader->key.as_ngg);
+   assert(sel->type == PIPE_SHADER_VERTEX ||
+          (sel->type == PIPE_SHADER_TESS_EVAL && !shader->key.as_es));
+
+   LLVMValueRef position[4] = {};
+   for (unsigned i = 0; i < info->num_outputs; i++) {
+      switch (info->output_semantic_name[i]) {
+      case TGSI_SEMANTIC_POSITION:
+         for (unsigned j = 0; j < 4; j++) {
+            position[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
+         }
+         break;
+      }
+   }
+   assert(position[0]);
+
+   /* Store Position.XYZW into LDS. */
+   LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
+   for (unsigned chan = 0; chan < 4; chan++) {
+      LLVMBuildStore(
+         builder, ac_to_integer(&ctx->ac, position[chan]),
+         ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_x + chan, 0)));
+   }
+   /* Store Position.XY / W into LDS. */
+   for (unsigned chan = 0; chan < 2; chan++) {
+      LLVMValueRef val = ac_build_fdiv(&ctx->ac, position[chan], position[3]);
+      LLVMBuildStore(
+         builder, ac_to_integer(&ctx->ac, val),
+         ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_x_div_w + chan, 0)));
+   }
+
+   /* Store VertexID and InstanceID. ES threads will have to load them
+    * from LDS after vertex compaction and use them instead of their own
+    * system values.
+    */
+   bool uses_instance_id = false;
+   bool uses_tes_prim_id = false;
+   LLVMValueRef packed_data = ctx->ac.i32_0;
+
+   if (ctx->type == PIPE_SHADER_VERTEX) {
+      uses_instance_id = sel->info.uses_instanceid ||
+                         shader->key.part.vs.prolog.instance_divisor_is_one ||
+                         shader->key.part.vs.prolog.instance_divisor_is_fetched;
+
+      LLVMBuildStore(
+         builder, ctx->abi.vertex_id,
+         ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_vertex_id, 0)));
+      if (uses_instance_id) {
+         LLVMBuildStore(
+            builder, ctx->abi.instance_id,
+            ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_instance_id, 0)));
+      }
+   } else {
+      uses_tes_prim_id = sel->info.uses_primid || shader->key.mono.u.vs_export_prim_id;
+
+      assert(ctx->type == PIPE_SHADER_TESS_EVAL);
+      LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_u)),
+                     ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_tes_u, 0)));
+      LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_v)),
+                     ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_tes_v, 0)));
+      packed_data = LLVMBuildShl(builder, ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id),
+                                 LLVMConstInt(ctx->ac.i32, lds_byte2_tes_rel_patch_id * 8, 0), "");
+      if (uses_tes_prim_id) {
+         LLVMBuildStore(
+            builder, ac_get_arg(&ctx->ac, ctx->args.tes_patch_id),
+            ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)));
+      }
+   }
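+   /* Byte layout of the packed dword: byte 0 holds the accept flag (later reused for
+    * the old thread ID), byte 1 the new thread ID, byte 2 the TES relative patch ID.
+    */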
+   /* Initialize the packed data. */
+   LLVMBuildStore(
+      builder, packed_data,
+      ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_packed_data, 0)));
+   ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
+
+   LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
+
+   /* Initialize the last 3 gs_ngg_scratch dwords to 0, because we may have fewer
+    * than 4 waves, but we always read all 4 values. This is where the thread
+    * bitmasks of unculled threads will be stored.
+    *
+    * gs_ngg_scratch layout: esmask[0..3]
+    */
+   ac_build_ifcc(&ctx->ac,
+                 LLVMBuildICmp(builder, LLVMIntULT, get_thread_id_in_tg(ctx),
+                               LLVMConstInt(ctx->ac.i32, 3, 0), ""),
+                 16101);
+   {
+      LLVMValueRef index = LLVMBuildAdd(builder, tid, ctx->ac.i32_1, "");
+      LLVMBuildStore(builder, ctx->ac.i32_0, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, index));
+   }
+   ac_build_endif(&ctx->ac, 16101);
+   ac_build_s_barrier(&ctx->ac);
+
+   /* The hardware requires that there are no holes between unculled vertices,
+    * which means we have to pack ES threads, i.e. reduce the ES thread count
+    * and move ES input VGPRs to lower threads. The upside is that varyings
+    * are only fetched and computed for unculled vertices.
+    *
+    * Vertex compaction in GS threads:
+    *
+    * Part 1: Compute the surviving vertex mask in GS threads:
+    * - Compute 4 32-bit surviving vertex masks in LDS. (max 4 waves)
+    *   - In GS, notify ES threads whether the vertex survived.
+    *   - Barrier
+    *   - ES threads will create the mask and store it in LDS.
+    * - Barrier
+    * - Each GS thread loads the vertex masks from LDS.
+    *
+    * Part 2: Compact ES threads in GS threads:
+    * - Compute the prefix sum for all 3 vertices from the masks. These are the new
+    *   thread IDs for each vertex within the primitive.
+    * - Write the value of the old thread ID into the LDS address of the new thread ID.
+    *   The ES thread will load the old thread ID and use it to load the position, VertexID,
+    *   and InstanceID.
+    * - Update vertex indices and null flag in the GS input VGPRs.
+    * - Barrier
+    *
+    * Part 3: Update input GPRs
+    * - For all waves, update per-wave thread counts in input SGPRs.
+    * - In ES threads, update the ES input VGPRs (VertexID, InstanceID, TES inputs).
+    */
+
+   LLVMValueRef vtxindex[3];
+   if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) {
+      /* For the GS fast launch, the VS prolog simply puts the Vertex IDs
+       * into these VGPRs.
+       */
+      vtxindex[0] = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);
+      vtxindex[1] = ac_get_arg(&ctx->ac, ctx->gs_vtx23_offset);
+      vtxindex[2] = ac_get_arg(&ctx->ac, ctx->gs_vtx45_offset);
+   } else {
+      vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
+      vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
+      vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
+   };
+   LLVMValueRef gs_vtxptr[] = {
+      ngg_nogs_vertex_ptr(ctx, vtxindex[0]),
+      ngg_nogs_vertex_ptr(ctx, vtxindex[1]),
+      ngg_nogs_vertex_ptr(ctx, vtxindex[2]),
+   };
+   es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
+
+   LLVMValueRef gs_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
+
+   /* Do culling in GS threads. */
+   ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 16002);
+   {
+      /* Load positions. */
+      LLVMValueRef pos[3][4] = {};
+      for (unsigned vtx = 0; vtx < 3; vtx++) {
+         for (unsigned chan = 0; chan < 4; chan++) {
+            unsigned index;
+            if (chan == 0 || chan == 1)
+               index = lds_pos_x_div_w + chan;
+            else if (chan == 3)
+               index = lds_pos_w;
+            else
+               continue;
+
+            LLVMValueRef addr =
+               ac_build_gep0(&ctx->ac, gs_vtxptr[vtx], LLVMConstInt(ctx->ac.i32, index, 0));
+            pos[vtx][chan] = LLVMBuildLoad(builder, addr, "");
+            pos[vtx][chan] = ac_to_float(&ctx->ac, pos[vtx][chan]);
+         }
+      }
+
+      /* Load the viewport state for small prim culling. */
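+      /* The loaded vec4 is consumed as {scale.x, scale.y, translate.x, translate.y}. */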
+      LLVMValueRef vp = ac_build_load_invariant(
+         &ctx->ac, ac_get_arg(&ctx->ac, ctx->small_prim_cull_info), ctx->ac.i32_0);
+      vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
+      LLVMValueRef vp_scale[2], vp_translate[2];
+      vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
+      vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
+      vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
+      vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);
+
+      /* Get the small prim filter precision. */
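+      /* The 4-bit value X becomes the float 2^(X - 15): (X | 0x70) forms the IEEE-754
+       * exponent field, which the shift then places into bits 30:23.
+       */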
+      LLVMValueRef small_prim_precision = si_unpack_param(ctx, ctx->vs_state_bits, 7, 4);
+      small_prim_precision =
+         LLVMBuildOr(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 0x70, 0), "");
+      small_prim_precision =
+         LLVMBuildShl(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 23, 0), "");
+      small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->ac.f32, "");
+
+      /* Execute culling code. */
+      struct ac_cull_options options = {};
+      options.cull_front = shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE;
+      options.cull_back = shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE;
+      options.cull_view_xy = shader->key.opt.ngg_culling & SI_NGG_CULL_VIEW_SMALLPRIMS;
+      options.cull_small_prims = options.cull_view_xy;
+      options.cull_zero_area = options.cull_front || options.cull_back;
+      options.cull_w = true;
+
+      /* Tell ES threads whether their vertex survived. */
+      ac_build_ifcc(&ctx->ac,
+                    ac_cull_triangle(&ctx->ac, pos, ctx->ac.i1true, vp_scale, vp_translate,
+                                     small_prim_precision, &options),
+                    16003);
+      {
+         LLVMBuildStore(builder, ctx->ac.i32_1, gs_accepted);
+         for (unsigned vtx = 0; vtx < 3; vtx++) {
+            LLVMBuildStore(builder, ctx->ac.i8_1,
+                           si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte0_accept_flag));
+         }
+      }
+      ac_build_endif(&ctx->ac, 16003);
+   }
+   ac_build_endif(&ctx->ac, 16002);
+   ac_build_s_barrier(&ctx->ac);
+
+   gs_accepted = LLVMBuildLoad(builder, gs_accepted, "");
+
+   LLVMValueRef es_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i1, "");
+
+   /* Convert the per-vertex flag to a thread bitmask in ES threads and store it in LDS. */
+   ac_build_ifcc(&ctx->ac, si_is_es_thread(ctx), 16007);
+   {
+      LLVMValueRef es_accepted_flag =
+         LLVMBuildLoad(builder, si_build_gep_i8(ctx, es_vtxptr, lds_byte0_accept_flag), "");
+
+      LLVMValueRef es_accepted_bool =
+         LLVMBuildICmp(builder, LLVMIntNE, es_accepted_flag, ctx->ac.i8_0, "");
+      LLVMValueRef es_mask = ac_get_i1_sgpr_mask(&ctx->ac, es_accepted_bool);
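+      /* es_mask is the per-wave ballot of accepted vertices; lane 0 of each wave
+       * publishes it to gs_ngg_scratch[wave_id] below.
+       */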
+
+      LLVMBuildStore(builder, es_accepted_bool, es_accepted);
+
+      ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntEQ, tid, ctx->ac.i32_0, ""), 16008);
+      {
+         LLVMBuildStore(builder, es_mask,
+                        ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, get_wave_id_in_tg(ctx)));
+      }
+      ac_build_endif(&ctx->ac, 16008);
+   }
+   ac_build_endif(&ctx->ac, 16007);
+   ac_build_s_barrier(&ctx->ac);
+
+   /* Load the vertex masks and compute the new ES thread count. */
+   LLVMValueRef es_mask[2], new_num_es_threads, kill_wave;
+   load_bitmasks_2x64(ctx, ctx->gs_ngg_scratch, 0, es_mask, &new_num_es_threads);
+   new_num_es_threads = ac_build_readlane_no_opt_barrier(&ctx->ac, new_num_es_threads, NULL);
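+   /* Broadcast the count from the first lane so that it is wave-uniform. */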
+
+   /* ES threads compute their prefix sum, which is the new ES thread ID.
+    * Then they write the value of the old thread ID into the LDS address
+    * of the new thread ID. It will be used to load input VGPRs from
+    * the old thread's LDS location.
+    */
+   ac_build_ifcc(&ctx->ac, LLVMBuildLoad(builder, es_accepted, ""), 16009);
+   {
+      LLVMValueRef old_id = get_thread_id_in_tg(ctx);
+      LLVMValueRef new_id = ac_prefix_bitcount_2x64(&ctx->ac, es_mask, old_id);
+
+      LLVMBuildStore(
+         builder, LLVMBuildTrunc(builder, old_id, ctx->ac.i8, ""),
+         si_build_gep_i8(ctx, ngg_nogs_vertex_ptr(ctx, new_id), lds_byte0_old_thread_id));
+      LLVMBuildStore(builder, LLVMBuildTrunc(builder, new_id, ctx->ac.i8, ""),
+                     si_build_gep_i8(ctx, es_vtxptr, lds_byte1_new_thread_id));
+   }
+   ac_build_endif(&ctx->ac, 16009);
+
+   /* Kill waves that have inactive threads. */
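+   /* A wave is dead when its first thread ID (wave_id * wave_size) is >= both the new ES
+    * thread count and the primitive count.
+    */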
+   kill_wave = LLVMBuildICmp(builder, LLVMIntULE,
+                             ac_build_imax(&ctx->ac, new_num_es_threads, ngg_get_prim_cnt(ctx)),
+                             LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
+                                          LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), ""),
+                             "");
+   ac_build_ifcc(&ctx->ac, kill_wave, 19202);
+   {
+      /* If we are killing wave 0, signal that there are no primitives
+       * in this threadgroup.
+       */
+      ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ctx->ac.i32_0);
+      ac_build_s_endpgm(&ctx->ac);
+   }
+   ac_build_endif(&ctx->ac, 19202);
+   ac_build_s_barrier(&ctx->ac);
+
+   /* Send the final vertex and primitive counts. */
+   ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), new_num_es_threads,
+                                 ngg_get_prim_cnt(ctx));
+
+   /* Update thread counts in SGPRs. */
+   LLVMValueRef new_gs_tg_info = ac_get_arg(&ctx->ac, ctx->gs_tg_info);
+   LLVMValueRef new_merged_wave_info = ac_get_arg(&ctx->ac, ctx->merged_wave_info);
+
+   /* This also converts the thread count from the total count to the per-wave count. */
+   update_thread_counts(ctx, &new_num_es_threads, &new_gs_tg_info, 9, 12, &new_merged_wave_info, 8,
+                        0);
+
+   /* Update vertex indices in VGPR0 (same format as NGG passthrough). */
+   LLVMValueRef new_vgpr0 = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
+
+   /* Set the null flag at the beginning (culled), and then
+    * overwrite it for accepted primitives.
+    */
+   LLVMBuildStore(builder, LLVMConstInt(ctx->ac.i32, 1u << 31, 0), new_vgpr0);
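+   /* NGG passthrough layout: 9-bit vertex indices at bit offsets 0, 10 and 20; bit 31
+    * marks a null (culled) primitive.
+    */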
+
+   /* Get vertex indices after vertex compaction. */
+   ac_build_ifcc(&ctx->ac, LLVMBuildTrunc(builder, gs_accepted, ctx->ac.i1, ""), 16011);
+   {
+      struct ac_ngg_prim prim = {};
+      prim.num_vertices = 3;
+      prim.isnull = ctx->ac.i1false;
+
+      for (unsigned vtx = 0; vtx < 3; vtx++) {
+         prim.index[vtx] = LLVMBuildLoad(
+            builder, si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte1_new_thread_id), "");
+         prim.index[vtx] = LLVMBuildZExt(builder, prim.index[vtx], ctx->ac.i32, "");
+         prim.edgeflag[vtx] = ngg_get_initial_edgeflag(ctx, vtx);
+      }
+
+      /* Set the new GS input VGPR. */
+      LLVMBuildStore(builder, ac_pack_prim_export(&ctx->ac, &prim), new_vgpr0);
+   }
+   ac_build_endif(&ctx->ac, 16011);
+
+   if (gfx10_ngg_export_prim_early(shader))
+      gfx10_ngg_build_export_prim(ctx, NULL, LLVMBuildLoad(builder, new_vgpr0, ""));
+
+   /* Set the new ES input VGPRs. */
+   LLVMValueRef es_data[4];
+   LLVMValueRef old_thread_id = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
+
+   for (unsigned i = 0; i < 4; i++)
+      es_data[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
+
+   ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid, new_num_es_threads, ""),
+                 16012);
+   {
+      LLVMValueRef old_id, old_es_vtxptr, tmp;
+
+      /* Load ES input VGPRs from the ES thread before compaction. */
+      old_id = LLVMBuildLoad(builder, si_build_gep_i8(ctx, es_vtxptr, lds_byte0_old_thread_id), "");
+      old_id = LLVMBuildZExt(builder, old_id, ctx->ac.i32, "");
+
+      LLVMBuildStore(builder, old_id, old_thread_id);
+      old_es_vtxptr = ngg_nogs_vertex_ptr(ctx, old_id);
+
+      for (unsigned i = 0; i < 2; i++) {
+         tmp = LLVMBuildLoad(
+            builder,
+            ac_build_gep0(&ctx->ac, old_es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_vertex_id + i, 0)),
+            "");
+         LLVMBuildStore(builder, tmp, es_data[i]);
+      }
+
+      if (ctx->type == PIPE_SHADER_TESS_EVAL) {
+         tmp = LLVMBuildLoad(builder,
+                             si_build_gep_i8(ctx, old_es_vtxptr, lds_byte2_tes_rel_patch_id), "");
+         tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
+         LLVMBuildStore(builder, tmp, es_data[2]);
+
+         if (uses_tes_prim_id) {
+            tmp = LLVMBuildLoad(builder,
+                                ac_build_gep0(&ctx->ac, old_es_vtxptr,
+                                              LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)),
+                                "");
+            LLVMBuildStore(builder, tmp, es_data[3]);
+         }
+      }
+   }
+   ac_build_endif(&ctx->ac, 16012);
+
+   /* Return values for the main function. */
+   LLVMValueRef ret = ctx->return_value;
+   LLVMValueRef val;
+
+   ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_gs_tg_info, 2, "");
+   ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_merged_wave_info, 3, "");
+   if (ctx->type == PIPE_SHADER_TESS_EVAL)
+      ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 4);
+
+   ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, 8 + SI_SGPR_RW_BUFFERS);
+   ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
+                             8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
+   ret = si_insert_input_ptr(ctx, ret, ctx->const_and_shader_buffers,
+                             8 + SI_SGPR_CONST_AND_SHADER_BUFFERS);
+   ret = si_insert_input_ptr(ctx, ret, ctx->samplers_and_images, 8 + SI_SGPR_SAMPLERS_AND_IMAGES);
+   ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
+
+   if (ctx->type == PIPE_SHADER_VERTEX) {
+      ret = si_insert_input_ptr(ctx, ret, ctx->args.base_vertex, 8 + SI_SGPR_BASE_VERTEX);
+      ret = si_insert_input_ptr(ctx, ret, ctx->args.start_instance, 8 + SI_SGPR_START_INSTANCE);
+      ret = si_insert_input_ptr(ctx, ret, ctx->args.draw_id, 8 + SI_SGPR_DRAWID);
+      ret = si_insert_input_ptr(ctx, ret, ctx->vertex_buffers, 8 + SI_VS_NUM_USER_SGPR);
+
+      for (unsigned i = 0; i < shader->selector->num_vbos_in_user_sgprs; i++) {
+         ret = si_insert_input_v4i32(ctx, ret, ctx->vb_descriptors[i],
+                                     8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + i * 4);
+      }
+   } else {
+      assert(ctx->type == PIPE_SHADER_TESS_EVAL);
+      ret = si_insert_input_ptr(ctx, ret, ctx->tcs_offchip_layout, 8 + SI_SGPR_TES_OFFCHIP_LAYOUT);
+      ret = si_insert_input_ptr(ctx, ret, ctx->tes_offchip_addr, 8 + SI_SGPR_TES_OFFCHIP_ADDR);
+   }
+
+   unsigned vgpr;
+   if (ctx->type == PIPE_SHADER_VERTEX) {
+      if (shader->selector->num_vbos_in_user_sgprs) {
+         vgpr = 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4;
+      } else {
+         vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1;
+      }
+   } else {
+      vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
+   }
+
+   val = LLVMBuildLoad(builder, new_vgpr0, "");
+   ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");
+   vgpr++; /* gs_vtx23_offset */
+
+   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
+   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
+   vgpr++; /* gs_vtx45_offset */
+
+   if (ctx->type == PIPE_SHADER_VERTEX) {
+      val = LLVMBuildLoad(builder, es_data[0], "");
+      ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++,
+                                 ""); /* VGPR5 - VertexID */
+      vgpr += 2;
+      if (uses_instance_id) {
+         val = LLVMBuildLoad(builder, es_data[1], "");
+         ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++,
+                                    ""); /* VGPR8 - InstanceID */
+      } else {
+         vgpr++;
+      }
+   } else {
+      assert(ctx->type == PIPE_SHADER_TESS_EVAL);
+      unsigned num_vgprs = uses_tes_prim_id ? 4 : 3;
+      for (unsigned i = 0; i < num_vgprs; i++) {
+         val = LLVMBuildLoad(builder, es_data[i], "");
+         ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");
+      }
+      if (num_vgprs == 3)
+         vgpr++;
+   }
+   /* Return the old thread ID. */
+   val = LLVMBuildLoad(builder, old_thread_id, "");
+   ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");
+
+   /* These two also use LDS. */
+   if (sel->info.writes_edgeflag ||
+       (ctx->type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id))
+      ac_build_s_barrier(&ctx->ac);
+
+   ctx->return_value = ret;
 }
 
 /**
  * Emit the epilogue of an API VS or TES shader compiled as ESGS shader.
  */
-void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
-                            unsigned max_outputs,
-                            LLVMValueRef *addrs)
+void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct si_shader_selector *sel = ctx->shader->selector;
-       struct si_shader_info *info = &sel->info;
-       struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef tmp, tmp2;
-
-       assert(!ctx->shader->is_gs_copy_shader);
-       assert(info->num_outputs <= max_outputs);
-
-       LLVMValueRef vertex_ptr = NULL;
-
-       if (sel->so.num_outputs || sel->info.writes_edgeflag)
-               vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
-
-       for (unsigned i = 0; i < info->num_outputs; i++) {
-               outputs[i].semantic_name = info->output_semantic_name[i];
-               outputs[i].semantic_index = info->output_semantic_index[i];
-
-               for (unsigned j = 0; j < 4; j++) {
-                       outputs[i].vertex_stream[j] =
-                               (info->output_streams[i] >> (2 * j)) & 3;
-
-                       /* TODO: we may store more outputs than streamout needs,
-                        * but streamout performance isn't that important.
-                        */
-                       if (sel->so.num_outputs) {
-                               tmp = ac_build_gep0(&ctx->ac, vertex_ptr,
-                                       LLVMConstInt(ctx->ac.i32, 4 * i + j, false));
-                               tmp2 = LLVMBuildLoad(builder, addrs[4 * i + j], "");
-                               tmp2 = ac_to_integer(&ctx->ac, tmp2);
-                               LLVMBuildStore(builder, tmp2, tmp);
-                       }
-               }
-
-               /* Store the edgeflag at the end (if streamout is enabled) */
-               if (info->output_semantic_name[i] == TGSI_SEMANTIC_EDGEFLAG &&
-                   sel->info.writes_edgeflag) {
-                       LLVMValueRef edgeflag = LLVMBuildLoad(builder, addrs[4 * i], "");
-                       /* The output is a float, but the hw expects a 1-bit integer. */
-                       edgeflag = LLVMBuildFPToUI(ctx->ac.builder, edgeflag, ctx->ac.i32, "");
-                       edgeflag = ac_build_umin(&ctx->ac, edgeflag, ctx->ac.i32_1);
-
-                       tmp = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
-                       tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp);
-                       LLVMBuildStore(builder, edgeflag, tmp);
-               }
-       }
-
-       bool unterminated_es_if_block =
-               !sel->so.num_outputs &&
-               !sel->info.writes_edgeflag &&
-               !ctx->screen->use_ngg_streamout && /* no query buffer */
-               (ctx->type != PIPE_SHADER_VERTEX ||
-                !ctx->shader->key.mono.u.vs_export_prim_id);
-
-       if (!unterminated_es_if_block)
-               ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
-
-       LLVMValueRef is_gs_thread = si_is_gs_thread(ctx);
-       LLVMValueRef is_es_thread = si_is_es_thread(ctx);
-       LLVMValueRef vtxindex[3];
-
-       if (ctx->shader->key.opt.ngg_culling) {
-               vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 9);
-               vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 10, 9);
-               vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 20, 9);
-       } else {
-               vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
-               vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
-               vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
-       }
-
-       /* Determine the number of vertices per primitive. */
-       unsigned num_vertices;
-       LLVMValueRef num_vertices_val = ngg_get_vertices_per_prim(ctx, &num_vertices);
-
-       /* Streamout */
-       LLVMValueRef emitted_prims = NULL;
-
-       if (sel->so.num_outputs) {
-               assert(!unterminated_es_if_block);
-
-               struct ngg_streamout nggso = {};
-               nggso.num_vertices = num_vertices_val;
-               nggso.prim_enable[0] = is_gs_thread;
-
-               for (unsigned i = 0; i < num_vertices; ++i)
-                       nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
-
-               build_streamout(ctx, &nggso);
-               emitted_prims = nggso.emit[0];
-       }
-
-       LLVMValueRef user_edgeflags[3] = {};
-
-       if (sel->info.writes_edgeflag) {
-               assert(!unterminated_es_if_block);
-
-               /* Streamout already inserted the barrier, so don't insert it again. */
-               if (!sel->so.num_outputs)
-                       ac_build_s_barrier(&ctx->ac);
-
-               ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
-               /* Load edge flags from ES threads and store them into VGPRs in GS threads. */
-               for (unsigned i = 0; i < num_vertices; i++) {
-                       tmp = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
-                       tmp2 = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
-                       tmp = ac_build_gep0(&ctx->ac, tmp, tmp2);
-                       tmp = LLVMBuildLoad(builder, tmp, "");
-                       tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
-
-                       user_edgeflags[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i1, "");
-                       LLVMBuildStore(builder, tmp, user_edgeflags[i]);
-               }
-               ac_build_endif(&ctx->ac, 5400);
-       }
-
-       /* Copy Primitive IDs from GS threads to the LDS address corresponding
-        * to the ES thread of the provoking vertex.
-        */
-       if (ctx->type == PIPE_SHADER_VERTEX &&
-           ctx->shader->key.mono.u.vs_export_prim_id) {
-               assert(!unterminated_es_if_block);
-
-               /* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */
-               if (sel->so.num_outputs || sel->info.writes_edgeflag)
-                       ac_build_s_barrier(&ctx->ac);
-
-               ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
-               /* Extract the PROVOKING_VTX_INDEX field. */
-               LLVMValueRef provoking_vtx_in_prim =
-                       si_unpack_param(ctx, ctx->vs_state_bits, 4, 2);
-
-               /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */
-               LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3);
-               LLVMValueRef provoking_vtx_index =
-                       LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, "");
-               LLVMValueRef vertex_ptr = ngg_nogs_vertex_ptr(ctx, provoking_vtx_index);
-
-               LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args.gs_prim_id),
-                              ac_build_gep0(&ctx->ac, vertex_ptr, ctx->ac.i32_0));
-               ac_build_endif(&ctx->ac, 5400);
-       }
-
-       /* Update query buffer */
-       if (ctx->screen->use_ngg_streamout &&
-           !info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
-               assert(!unterminated_es_if_block);
-
-               tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1);
-               tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
-               ac_build_ifcc(&ctx->ac, tmp, 5029); /* if (STREAMOUT_QUERY_ENABLED) */
-               tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
-               ac_build_ifcc(&ctx->ac, tmp, 5030);
-               tmp = LLVMBuildICmp(builder, LLVMIntULE, ac_get_thread_id(&ctx->ac),
-                                   sel->so.num_outputs ? ctx->ac.i32_1 : ctx->ac.i32_0, "");
-               ac_build_ifcc(&ctx->ac, tmp, 5031);
-               {
-                       LLVMValueRef args[] = {
-                               ngg_get_prim_cnt(ctx),
-                               ngg_get_query_buf(ctx),
-                               LLVMConstInt(ctx->ac.i32, 16, false), /* offset of stream[0].generated_primitives */
-                               ctx->ac.i32_0, /* soffset */
-                               ctx->ac.i32_0, /* cachepolicy */
-                       };
-
-                       if (sel->so.num_outputs) {
-                               args[0] = ac_build_writelane(&ctx->ac, args[0], emitted_prims, ctx->ac.i32_1);
-                               args[2] = ac_build_writelane(&ctx->ac, args[2],
-                                               LLVMConstInt(ctx->ac.i32, 24, false), ctx->ac.i32_1);
-                       }
-
-                       /* TODO: should this be 64-bit atomics? */
-                       ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32",
-                                          ctx->ac.i32, args, 5, 0);
-               }
-               ac_build_endif(&ctx->ac, 5031);
-               ac_build_endif(&ctx->ac, 5030);
-               ac_build_endif(&ctx->ac, 5029);
-       }
-
-       /* Build the primitive export. */
-       if (!gfx10_ngg_export_prim_early(ctx->shader)) {
-               assert(!unterminated_es_if_block);
-               gfx10_ngg_build_export_prim(ctx, user_edgeflags, NULL);
-       }
-
-       /* Export per-vertex data (positions and parameters). */
-       if (!unterminated_es_if_block)
-               ac_build_ifcc(&ctx->ac, is_es_thread, 6002);
-       {
-               unsigned i;
-
-               /* Unconditionally (re-)load the values for proper SSA form. */
-               for (i = 0; i < info->num_outputs; i++) {
-                       /* If the NGG cull shader part computed the position, don't
-                        * use the position from the current shader part. Instead,
-                        * load it from LDS.
-                        */
-                       if (info->output_semantic_name[i] == TGSI_SEMANTIC_POSITION &&
-                           ctx->shader->key.opt.ngg_culling) {
-                               vertex_ptr = ngg_nogs_vertex_ptr(ctx,
-                                               ac_get_arg(&ctx->ac, ctx->ngg_old_thread_id));
-
-                               for (unsigned j = 0; j < 4; j++) {
-                                       tmp = LLVMConstInt(ctx->ac.i32, lds_pos_x + j, 0);
-                                       tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp);
-                                       tmp = LLVMBuildLoad(builder, tmp, "");
-                                       outputs[i].values[j] = ac_to_float(&ctx->ac, tmp);
-                               }
-                       } else {
-                               for (unsigned j = 0; j < 4; j++) {
-                                       outputs[i].values[j] =
-                                               LLVMBuildLoad(builder,
-                                                             addrs[4 * i + j], "");
-                               }
-                       }
-               }
-
-               if (ctx->shader->key.mono.u.vs_export_prim_id) {
-                       outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
-                       outputs[i].semantic_index = 0;
-
-                       if (ctx->type == PIPE_SHADER_VERTEX) {
-                               /* Wait for GS stores to finish. */
-                               ac_build_s_barrier(&ctx->ac);
-
-                               tmp = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
-                               tmp = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
-                               outputs[i].values[0] = LLVMBuildLoad(builder, tmp, "");
-                       } else {
-                               assert(ctx->type == PIPE_SHADER_TESS_EVAL);
-                               outputs[i].values[0] = si_get_primitive_id(ctx, 0);
-                       }
-
-                       outputs[i].values[0] = ac_to_float(&ctx->ac, outputs[i].values[0]);
-                       for (unsigned j = 1; j < 4; j++)
-                               outputs[i].values[j] = LLVMGetUndef(ctx->ac.f32);
-
-                       memset(outputs[i].vertex_stream, 0,
-                              sizeof(outputs[i].vertex_stream));
-                       i++;
-               }
-
-               si_llvm_build_vs_exports(ctx, outputs, i);
-       }
-       ac_build_endif(&ctx->ac, 6002);
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader_selector *sel = ctx->shader->selector;
+   struct si_shader_info *info = &sel->info;
+   struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMValueRef tmp, tmp2;
+
+   assert(!ctx->shader->is_gs_copy_shader);
+   assert(info->num_outputs <= max_outputs);
+
+   LLVMValueRef vertex_ptr = NULL;
+
+   if (sel->so.num_outputs || sel->info.writes_edgeflag)
+      vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
+
+   for (unsigned i = 0; i < info->num_outputs; i++) {
+      outputs[i].semantic_name = info->output_semantic_name[i];
+      outputs[i].semantic_index = info->output_semantic_index[i];
+
+      for (unsigned j = 0; j < 4; j++) {
+         outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3;
+
+         /* TODO: we may store more outputs than streamout needs,
+          * but streamout performance isn't that important.
+          */
+         if (sel->so.num_outputs) {
+            tmp = ac_build_gep0(&ctx->ac, vertex_ptr, LLVMConstInt(ctx->ac.i32, 4 * i + j, false));
+            tmp2 = LLVMBuildLoad(builder, addrs[4 * i + j], "");
+            tmp2 = ac_to_integer(&ctx->ac, tmp2);
+            LLVMBuildStore(builder, tmp2, tmp);
+         }
+      }
+
+      /* Store the edgeflag at the end (if streamout is enabled) */
+      if (info->output_semantic_name[i] == TGSI_SEMANTIC_EDGEFLAG && sel->info.writes_edgeflag) {
+         LLVMValueRef edgeflag = LLVMBuildLoad(builder, addrs[4 * i], "");
+         /* The output is a float, but the hw expects a 1-bit integer. */
+         edgeflag = LLVMBuildFPToUI(ctx->ac.builder, edgeflag, ctx->ac.i32, "");
+         edgeflag = ac_build_umin(&ctx->ac, edgeflag, ctx->ac.i32_1);
+
+         tmp = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
+         tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp);
+         LLVMBuildStore(builder, edgeflag, tmp);
+      }
+   }
+
+   bool unterminated_es_if_block =
+      !sel->so.num_outputs && !sel->info.writes_edgeflag &&
+      !ctx->screen->use_ngg_streamout && /* no query buffer */
+      (ctx->type != PIPE_SHADER_VERTEX || !ctx->shader->key.mono.u.vs_export_prim_id);
+
+   if (!unterminated_es_if_block)
+      ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
+
+   LLVMValueRef is_gs_thread = si_is_gs_thread(ctx);
+   LLVMValueRef is_es_thread = si_is_es_thread(ctx);
+   LLVMValueRef vtxindex[3];
+
+   if (ctx->shader->key.opt.ngg_culling) {
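+      /* The culling shader part packed the compacted vertex indices as 9-bit fields
+       * into this VGPR (see new_vgpr0 in the culling epilogue above).
+       */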
+      vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 9);
+      vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 10, 9);
+      vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 20, 9);
+   } else {
+      vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
+      vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
+      vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
+   }
+
+   /* Determine the number of vertices per primitive. */
+   unsigned num_vertices;
+   LLVMValueRef num_vertices_val = ngg_get_vertices_per_prim(ctx, &num_vertices);
+
+   /* Streamout */
+   LLVMValueRef emitted_prims = NULL;
+
+   if (sel->so.num_outputs) {
+      assert(!unterminated_es_if_block);
+
+      struct ngg_streamout nggso = {};
+      nggso.num_vertices = num_vertices_val;
+      nggso.prim_enable[0] = is_gs_thread;
+
+      for (unsigned i = 0; i < num_vertices; ++i)
+         nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
+
+      build_streamout(ctx, &nggso);
+      emitted_prims = nggso.emit[0];
+   }
+
+   LLVMValueRef user_edgeflags[3] = {};
+
+   if (sel->info.writes_edgeflag) {
+      assert(!unterminated_es_if_block);
+
+      /* Streamout already inserted the barrier, so don't insert it again. */
+      if (!sel->so.num_outputs)
+         ac_build_s_barrier(&ctx->ac);
+
+      ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
+      /* Load edge flags from ES threads and store them into VGPRs in GS threads. */
+      for (unsigned i = 0; i < num_vertices; i++) {
+         tmp = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
+         tmp2 = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
+         tmp = ac_build_gep0(&ctx->ac, tmp, tmp2);
+         tmp = LLVMBuildLoad(builder, tmp, "");
+         tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+
+         user_edgeflags[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i1, "");
+         LLVMBuildStore(builder, tmp, user_edgeflags[i]);
+      }
+      ac_build_endif(&ctx->ac, 5400);
+   }
+
+   /* Copy Primitive IDs from GS threads to the LDS address corresponding
+    * to the ES thread of the provoking vertex.
+    */
+   if (ctx->type == PIPE_SHADER_VERTEX && ctx->shader->key.mono.u.vs_export_prim_id) {
+      assert(!unterminated_es_if_block);
+
+      /* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */
+      if (sel->so.num_outputs || sel->info.writes_edgeflag)
+         ac_build_s_barrier(&ctx->ac);
+
+      ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
+      /* Extract the PROVOKING_VTX_INDEX field. */
+      LLVMValueRef provoking_vtx_in_prim = si_unpack_param(ctx, ctx->vs_state_bits, 4, 2);
+
+      /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */
+      LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3);
+      LLVMValueRef provoking_vtx_index =
+         LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, "");
+      LLVMValueRef vertex_ptr = ngg_nogs_vertex_ptr(ctx, provoking_vtx_index);
+
+      LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args.gs_prim_id),
+                     ac_build_gep0(&ctx->ac, vertex_ptr, ctx->ac.i32_0));
+      ac_build_endif(&ctx->ac, 5400);
+   }
+
+   /* Update query buffer */
+   if (ctx->screen->use_ngg_streamout && !info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
+      assert(!unterminated_es_if_block);
+
+      tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1);
+      tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+      ac_build_ifcc(&ctx->ac, tmp, 5029); /* if (STREAMOUT_QUERY_ENABLED) */
+      tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
+      ac_build_ifcc(&ctx->ac, tmp, 5030);
+      tmp = LLVMBuildICmp(builder, LLVMIntULE, ac_get_thread_id(&ctx->ac),
+                          sel->so.num_outputs ? ctx->ac.i32_1 : ctx->ac.i32_0, "");
+      ac_build_ifcc(&ctx->ac, tmp, 5031);
+      {
+         LLVMValueRef args[] = {
+            ngg_get_prim_cnt(ctx),
+            ngg_get_query_buf(ctx),
+            LLVMConstInt(ctx->ac.i32, 16, false), /* offset of stream[0].generated_primitives */
+            ctx->ac.i32_0,                        /* soffset */
+            ctx->ac.i32_0,                        /* cachepolicy */
+         };
+
+         if (sel->so.num_outputs) {
+            args[0] = ac_build_writelane(&ctx->ac, args[0], emitted_prims, ctx->ac.i32_1);
+            args[2] = ac_build_writelane(&ctx->ac, args[2], LLVMConstInt(ctx->ac.i32, 24, false),
+                                         ctx->ac.i32_1);
+         }
+
+         /* TODO: should this be 64-bit atomics? */
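+         /* Lane 0 adds the generated primitive count at offset 16; with streamout,
+          * lane 1 adds the emitted primitive count at offset 24.
+          */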
+         ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5,
+                            0);
+      }
+      ac_build_endif(&ctx->ac, 5031);
+      ac_build_endif(&ctx->ac, 5030);
+      ac_build_endif(&ctx->ac, 5029);
+   }
+
+   /* Build the primitive export. */
+   if (!gfx10_ngg_export_prim_early(ctx->shader)) {
+      assert(!unterminated_es_if_block);
+      gfx10_ngg_build_export_prim(ctx, user_edgeflags, NULL);
+   }
+
+   /* Export per-vertex data (positions and parameters). */
+   if (!unterminated_es_if_block)
+      ac_build_ifcc(&ctx->ac, is_es_thread, 6002);
+   {
+      unsigned i;
+
+      /* Unconditionally (re-)load the values for proper SSA form. */
+      for (i = 0; i < info->num_outputs; i++) {
+         /* If the NGG cull shader part computed the position, don't
+          * use the position from the current shader part. Instead,
+          * load it from LDS.
+          */
+         if (info->output_semantic_name[i] == TGSI_SEMANTIC_POSITION &&
+             ctx->shader->key.opt.ngg_culling) {
+            vertex_ptr = ngg_nogs_vertex_ptr(ctx, ac_get_arg(&ctx->ac, ctx->ngg_old_thread_id));
+
+            for (unsigned j = 0; j < 4; j++) {
+               tmp = LLVMConstInt(ctx->ac.i32, lds_pos_x + j, 0);
+               tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp);
+               tmp = LLVMBuildLoad(builder, tmp, "");
+               outputs[i].values[j] = ac_to_float(&ctx->ac, tmp);
+            }
+         } else {
+            for (unsigned j = 0; j < 4; j++) {
+               outputs[i].values[j] = LLVMBuildLoad(builder, addrs[4 * i + j], "");
+            }
+         }
+      }
+
+      if (ctx->shader->key.mono.u.vs_export_prim_id) {
+         outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
+         outputs[i].semantic_index = 0;
+
+         if (ctx->type == PIPE_SHADER_VERTEX) {
+            /* Wait for GS stores to finish. */
+            ac_build_s_barrier(&ctx->ac);
+
+            tmp = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
+            tmp = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
+            outputs[i].values[0] = LLVMBuildLoad(builder, tmp, "");
+         } else {
+            assert(ctx->type == PIPE_SHADER_TESS_EVAL);
+            outputs[i].values[0] = si_get_primitive_id(ctx, 0);
+         }
+
+         outputs[i].values[0] = ac_to_float(&ctx->ac, outputs[i].values[0]);
+         for (unsigned j = 1; j < 4; j++)
+            outputs[i].values[j] = LLVMGetUndef(ctx->ac.f32);
+
+         memset(outputs[i].vertex_stream, 0, sizeof(outputs[i].vertex_stream));
+         i++;
+      }
+
+      si_llvm_build_vs_exports(ctx, outputs, i);
+   }
+   ac_build_endif(&ctx->ac, 6002);
 }
 
-static LLVMValueRef
-ngg_gs_get_vertex_storage(struct si_shader_context *ctx)
+static LLVMValueRef ngg_gs_get_vertex_storage(struct si_shader_context *ctx)
 {
-       const struct si_shader_selector *sel = ctx->shader->selector;
-       const struct si_shader_info *info = &sel->info;
-
-       LLVMTypeRef elements[2] = {
-               LLVMArrayType(ctx->ac.i32, 4 * info->num_outputs),
-               LLVMArrayType(ctx->ac.i8, 4),
-       };
-       LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false);
-       type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS);
-       return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, "");
+   const struct si_shader_selector *sel = ctx->shader->selector;
+   const struct si_shader_info *info = &sel->info;
+
+   LLVMTypeRef elements[2] = {
+      LLVMArrayType(ctx->ac.i32, 4 * info->num_outputs),
+      LLVMArrayType(ctx->ac.i8, 4),
+   };
+   LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false);
+   type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS);
+   return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, "");
 }
 
 /**
@@ -1536,452 +1458,424 @@ ngg_gs_get_vertex_storage(struct si_shader_context *ctx)
  *
  * \return an LDS pointer to type {[N x i32], [4 x i8]}
  */
-static LLVMValueRef
-ngg_gs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vertexidx)
+static LLVMValueRef ngg_gs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vertexidx)
 {
-       struct si_shader_selector *sel = ctx->shader->selector;
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);
-
-       /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
-       unsigned write_stride_2exp = ffs(sel->gs_max_out_vertices) - 1;
-       if (write_stride_2exp) {
-               LLVMValueRef row =
-                       LLVMBuildLShr(builder, vertexidx,
-                                     LLVMConstInt(ctx->ac.i32, 5, false), "");
-               LLVMValueRef swizzle =
-                       LLVMBuildAnd(builder, row,
-                                    LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1,
-                                                 false), "");
-               vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, "");
-       }
-
-       return ac_build_gep0(&ctx->ac, storage, vertexidx);
+   struct si_shader_selector *sel = ctx->shader->selector;
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);
+
+   /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
+   unsigned write_stride_2exp = ffs(sel->gs_max_out_vertices) - 1;
+   if (write_stride_2exp) {
+      LLVMValueRef row = LLVMBuildLShr(builder, vertexidx, LLVMConstInt(ctx->ac.i32, 5, false), "");
+      LLVMValueRef swizzle = LLVMBuildAnd(
+         builder, row, LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1, false), "");
+      vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, "");
+   }
+
+   return ac_build_gep0(&ctx->ac, storage, vertexidx);
 }
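
   (Editor's note, not part of the commit: the reformatted ngg_gs_vertex_ptr() above XORs the
   vertex index with a few bits derived from it before indexing LDS. A minimal scalar sketch of
   that swizzle, assuming only what the in-code comment states about gs_max_out_vertices being
   2^write_stride_2exp times an odd number, may make the LLVM IR construction easier to follow.
   The bank-conflict motivation is an inference, not stated in the source.)

   /* Editorial scalar model of the index swizzle in ngg_gs_vertex_ptr(). */
   #include <strings.h> /* ffs() */

   static unsigned swizzle_vertex_index(unsigned vertexidx, unsigned gs_max_out_vertices)
   {
      unsigned write_stride_2exp = ffs(gs_max_out_vertices) - 1; /* power-of-two factor */

      if (write_stride_2exp) {
         unsigned row = vertexidx >> 5;                            /* rows of 32 vertices */
         unsigned swizzle = row & ((1u << write_stride_2exp) - 1); /* low bits of the row */
         vertexidx ^= swizzle;            /* likely spreads rows across LDS banks */
      }
      return vertexidx;
   }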
 
-static LLVMValueRef
-ngg_gs_emit_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef gsthread,
-                      LLVMValueRef emitidx)
+static LLVMValueRef ngg_gs_emit_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef gsthread,
+                                           LLVMValueRef emitidx)
 {
-       struct si_shader_selector *sel = ctx->shader->selector;
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef tmp;
-
-       tmp = LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false);
-       tmp = LLVMBuildMul(builder, tmp, gsthread, "");
-       const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, "");
-       return ngg_gs_vertex_ptr(ctx, vertexidx);
+   struct si_shader_selector *sel = ctx->shader->selector;
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMValueRef tmp;
+
+   tmp = LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false);
+   tmp = LLVMBuildMul(builder, tmp, gsthread, "");
+   const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, "");
+   return ngg_gs_vertex_ptr(ctx, vertexidx);
 }
 
-static LLVMValueRef
-ngg_gs_get_emit_output_ptr(struct si_shader_context *ctx, LLVMValueRef vertexptr,
-                          unsigned out_idx)
+static LLVMValueRef ngg_gs_get_emit_output_ptr(struct si_shader_context *ctx,
+                                               LLVMValueRef vertexptr, unsigned out_idx)
 {
-       LLVMValueRef gep_idx[3] = {
-               ctx->ac.i32_0, /* implied C-style array */
-               ctx->ac.i32_0, /* first struct entry */
-               LLVMConstInt(ctx->ac.i32, out_idx, false),
-       };
-       return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
+   LLVMValueRef gep_idx[3] = {
+      ctx->ac.i32_0, /* implied C-style array */
+      ctx->ac.i32_0, /* first struct entry */
+      LLVMConstInt(ctx->ac.i32, out_idx, false),
+   };
+   return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
 }
 
-static LLVMValueRef
-ngg_gs_get_emit_primflag_ptr(struct si_shader_context *ctx, LLVMValueRef vertexptr,
-                            unsigned stream)
+static LLVMValueRef ngg_gs_get_emit_primflag_ptr(struct si_shader_context *ctx,
+                                                 LLVMValueRef vertexptr, unsigned stream)
 {
-       LLVMValueRef gep_idx[3] = {
-               ctx->ac.i32_0, /* implied C-style array */
-               ctx->ac.i32_1, /* second struct entry */
-               LLVMConstInt(ctx->ac.i32, stream, false),
-       };
-       return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
+   LLVMValueRef gep_idx[3] = {
+      ctx->ac.i32_0, /* implied C-style array */
+      ctx->ac.i32_1, /* second struct entry */
+      LLVMConstInt(ctx->ac.i32, stream, false),
+   };
+   return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
 }
 
-void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx,
-                             unsigned stream,
-                             LLVMValueRef *addrs)
+void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs)
 {
-       const struct si_shader_selector *sel = ctx->shader->selector;
-       const struct si_shader_info *info = &sel->info;
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef tmp;
-       const LLVMValueRef vertexidx =
-               LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
-
-       /* If this thread has already emitted the declared maximum number of
-        * vertices, skip the write: excessive vertex emissions are not
-        * supposed to have any effect.
-        */
-       const LLVMValueRef can_emit =
-               LLVMBuildICmp(builder, LLVMIntULT, vertexidx,
-                             LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), "");
-
-       tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
-       tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, "");
-       LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
-
-       ac_build_ifcc(&ctx->ac, can_emit, 9001);
-
-       const LLVMValueRef vertexptr =
-               ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx);
-       unsigned out_idx = 0;
-       for (unsigned i = 0; i < info->num_outputs; i++) {
-               for (unsigned chan = 0; chan < 4; chan++, out_idx++) {
-                       if (!(info->output_usagemask[i] & (1 << chan)) ||
-                           ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
-                               continue;
-
-                       LLVMValueRef out_val = LLVMBuildLoad(builder, addrs[4 * i + chan], "");
-                       out_val = ac_to_integer(&ctx->ac, out_val);
-                       LLVMBuildStore(builder, out_val,
-                                      ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx));
-               }
-       }
-       assert(out_idx * 4 == sel->gsvs_vertex_size);
-
-       /* Determine and store whether this vertex completed a primitive. */
-       const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");
-
-       tmp = LLVMConstInt(ctx->ac.i32, u_vertices_per_prim(sel->gs_output_prim) - 1, false);
-       const LLVMValueRef iscompleteprim =
-               LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, "");
-
-       /* Since the geometry shader emits triangle strips, we need to
-        * track which primitive is odd and swap vertex indices to get
-        * the correct vertex order.
-        */
-       LLVMValueRef is_odd = ctx->ac.i1false;
-       if (stream == 0 && u_vertices_per_prim(sel->gs_output_prim) == 3) {
-               tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, "");
-               is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, "");
-       }
-
-       tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, "");
-       LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]);
-
-       /* The per-vertex primitive flag encoding:
-        *   bit 0: whether this vertex finishes a primitive
-        *   bit 1: whether the primitive is odd (if we are emitting triangle strips)
-        */
-       tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
-       tmp = LLVMBuildOr(builder, tmp,
-                         LLVMBuildShl(builder,
-                                      LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""),
-                                      ctx->ac.i8_1, ""), "");
-       LLVMBuildStore(builder, tmp, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream));
-
-       tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
-       tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
-       LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
-
-       ac_build_endif(&ctx->ac, 9001);
+   const struct si_shader_selector *sel = ctx->shader->selector;
+   const struct si_shader_info *info = &sel->info;
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMValueRef tmp;
+   const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
+
+   /* If this thread has already emitted the declared maximum number of
+    * vertices, skip the write: excessive vertex emissions are not
+    * supposed to have any effect.
+    */
+   const LLVMValueRef can_emit =
+      LLVMBuildICmp(builder, LLVMIntULT, vertexidx,
+                    LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), "");
+
+   tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
+   tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, "");
+   LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
+
+   ac_build_ifcc(&ctx->ac, can_emit, 9001);
+
+   const LLVMValueRef vertexptr = ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx);
+   unsigned out_idx = 0;
+   for (unsigned i = 0; i < info->num_outputs; i++) {
+      for (unsigned chan = 0; chan < 4; chan++, out_idx++) {
+         if (!(info->output_usagemask[i] & (1 << chan)) ||
+             ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
+            continue;
+
+         LLVMValueRef out_val = LLVMBuildLoad(builder, addrs[4 * i + chan], "");
+         out_val = ac_to_integer(&ctx->ac, out_val);
+         LLVMBuildStore(builder, out_val, ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx));
+      }
+   }
+   assert(out_idx * 4 == sel->gsvs_vertex_size);
+
+   /* Determine and store whether this vertex completed a primitive. */
+   const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");
+
+   tmp = LLVMConstInt(ctx->ac.i32, u_vertices_per_prim(sel->gs_output_prim) - 1, false);
+   const LLVMValueRef iscompleteprim = LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, "");
+
+   /* Since the geometry shader emits triangle strips, we need to
+    * track which primitive is odd and swap vertex indices to get
+    * the correct vertex order.
+    */
+   LLVMValueRef is_odd = ctx->ac.i1false;
+   if (stream == 0 && u_vertices_per_prim(sel->gs_output_prim) == 3) {
+      tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, "");
+      is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, "");
+   }
+
+   tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, "");
+   LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]);
+
+   /* The per-vertex primitive flag encoding:
+    *   bit 0: whether this vertex finishes a primitive
+    *   bit 1: whether the primitive is odd (if we are emitting triangle strips)
+    */
+   tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
+   tmp = LLVMBuildOr(
+      builder, tmp,
+      LLVMBuildShl(builder, LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""), ctx->ac.i8_1, ""), "");
+   LLVMBuildStore(builder, tmp, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream));
+
+   tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
+   tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
+   LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
+
+   ac_build_endif(&ctx->ac, 9001);
 }
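
   (Editor's note, not part of the commit: the per-vertex primitive flag written by
   gfx10_ngg_gs_emit_vertex() above can be modelled with plain integers. This sketch assumes
   only what the in-code comment states, bit 0 meaning the vertex completes a primitive and
   bit 1 meaning an odd triangle in a strip.)

   /* Editorial sketch of the primitive-flag byte stored per emitted vertex.
    * curverts is the number of vertices already emitted for the current primitive. */
   #include <stdbool.h>

   static unsigned char ngg_primflag(unsigned curverts, unsigned verts_per_prim,
                                     bool strip_triangles)
   {
      /* bit 0: this vertex completes a primitive */
      bool iscompleteprim = curverts >= verts_per_prim - 1;
      /* bit 1: odd primitive in a triangle strip; its winding is fixed up at export time */
      bool is_odd = strip_triangles && (curverts & 1);

      return (unsigned char)iscompleteprim | ((unsigned char)is_odd << 1);
   }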
 
 void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx)
 {
-       /* Zero out the part of LDS scratch that is used to accumulate the
-        * per-stream generated primitive count.
-        */
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef scratchptr = ctx->gs_ngg_scratch;
-       LLVMValueRef tid = get_thread_id_in_tg(ctx);
-       LLVMValueRef tmp;
-
-       tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");
-       ac_build_ifcc(&ctx->ac, tmp, 5090);
-       {
-               LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid);
-               LLVMBuildStore(builder, ctx->ac.i32_0, ptr);
-       }
-       ac_build_endif(&ctx->ac, 5090);
-
-       ac_build_s_barrier(&ctx->ac);
+   /* Zero out the part of LDS scratch that is used to accumulate the
+    * per-stream generated primitive count.
+    */
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMValueRef scratchptr = ctx->gs_ngg_scratch;
+   LLVMValueRef tid = get_thread_id_in_tg(ctx);
+   LLVMValueRef tmp;
+
+   tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");
+   ac_build_ifcc(&ctx->ac, tmp, 5090);
+   {
+      LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid);
+      LLVMBuildStore(builder, ctx->ac.i32_0, ptr);
+   }
+   ac_build_endif(&ctx->ac, 5090);
+
+   ac_build_s_barrier(&ctx->ac);
 }
 
 void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
 {
-       const struct si_shader_selector *sel = ctx->shader->selector;
-       const struct si_shader_info *info = &sel->info;
-       const unsigned verts_per_prim = u_vertices_per_prim(sel->gs_output_prim);
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false);
-       LLVMValueRef tmp, tmp2;
-
-       /* Zero out remaining (non-emitted) primitive flags.
-        *
-        * Note: Alternatively, we could pass the relevant gs_next_vertex to
-        *       the emit threads via LDS. This is likely worse in the expected
-        *       typical case where each GS thread emits the full set of
-        *       vertices.
-        */
-       for (unsigned stream = 0; stream < 4; ++stream) {
-               if (!info->num_stream_output_components[stream])
-                       continue;
-
-               const LLVMValueRef gsthread = get_thread_id_in_tg(ctx);
-
-               ac_build_bgnloop(&ctx->ac, 5100);
-
-               const LLVMValueRef vertexidx =
-                       LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
-               tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx,
-                       LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), "");
-               ac_build_ifcc(&ctx->ac, tmp, 5101);
-               ac_build_break(&ctx->ac);
-               ac_build_endif(&ctx->ac, 5101);
-
-               tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
-               LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
-
-               tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx);
-               LLVMBuildStore(builder, i8_0, ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream));
-
-               ac_build_endloop(&ctx->ac, 5100);
-       }
-
-       /* Accumulate generated primitives counts across the entire threadgroup. */
-       for (unsigned stream = 0; stream < 4; ++stream) {
-               if (!info->num_stream_output_components[stream])
-                       continue;
-
-               LLVMValueRef numprims =
-                       LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
-               numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size);
-
-               tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, "");
-               ac_build_ifcc(&ctx->ac, tmp, 5105);
-               {
-                       LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
-                                          ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch,
-                                                        LLVMConstInt(ctx->ac.i32, stream, false)),
-                                          numprims, LLVMAtomicOrderingMonotonic, false);
-               }
-               ac_build_endif(&ctx->ac, 5105);
-       }
-
-       ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
-
-       ac_build_s_barrier(&ctx->ac);
-
-       const LLVMValueRef tid = get_thread_id_in_tg(ctx);
-       LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);
-
-       /* Streamout */
-       if (sel->so.num_outputs) {
-               struct ngg_streamout nggso = {};
-
-               nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false);
-
-               LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid);
-               for (unsigned stream = 0; stream < 4; ++stream) {
-                       if (!info->num_stream_output_components[stream])
-                               continue;
-
-                       tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), "");
-                       tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
-                       tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
-                       nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, "");
-               }
-
-               for (unsigned i = 0; i < verts_per_prim; ++i) {
-                       tmp = LLVMBuildSub(builder, tid,
-                                          LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
-                       tmp = ngg_gs_vertex_ptr(ctx, tmp);
-                       nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
-               }
-
-               build_streamout(ctx, &nggso);
-       }
-
-       /* Write shader query data. */
-       if (ctx->screen->use_ngg_streamout) {
-               tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1);
-               tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
-               ac_build_ifcc(&ctx->ac, tmp, 5109); /* if (STREAMOUT_QUERY_ENABLED) */
-               unsigned num_query_comps = sel->so.num_outputs ? 8 : 4;
-               tmp = LLVMBuildICmp(builder, LLVMIntULT, tid,
-                                   LLVMConstInt(ctx->ac.i32, num_query_comps, false), "");
-               ac_build_ifcc(&ctx->ac, tmp, 5110);
-               {
-                       LLVMValueRef offset;
-                       tmp = tid;
-                       if (sel->so.num_outputs)
-                               tmp = LLVMBuildAnd(builder, tmp, LLVMConstInt(ctx->ac.i32, 3, false), "");
-                       offset = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 32, false), "");
-                       if (sel->so.num_outputs) {
-                               tmp = LLVMBuildLShr(builder, tid, LLVMConstInt(ctx->ac.i32, 2, false), "");
-                               tmp = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 8, false), "");
-                               offset = LLVMBuildAdd(builder, offset, tmp, "");
-                       }
-
-                       tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
-                       LLVMValueRef args[] = {
-                               tmp,
-                               ngg_get_query_buf(ctx),
-                               offset,
-                               LLVMConstInt(ctx->ac.i32, 16, false), /* soffset */
-                               ctx->ac.i32_0, /* cachepolicy */
-                       };
-                       ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32",
-                                          ctx->ac.i32, args, 5, 0);
-               }
-               ac_build_endif(&ctx->ac, 5110);
-               ac_build_endif(&ctx->ac, 5109);
-       }
-
-       /* Determine vertex liveness. */
-       LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive");
-
-       tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
-       ac_build_ifcc(&ctx->ac, tmp, 5120);
-       {
-               for (unsigned i = 0; i < verts_per_prim; ++i) {
-                       const LLVMValueRef primidx =
-                               LLVMBuildAdd(builder, tid,
-                                            LLVMConstInt(ctx->ac.i32, i, false), "");
-
-                       if (i > 0) {
-                               tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, "");
-                               ac_build_ifcc(&ctx->ac, tmp, 5121 + i);
-                       }
-
-                       /* Load primitive liveness */
-                       tmp = ngg_gs_vertex_ptr(ctx, primidx);
-                       tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
-                       const LLVMValueRef primlive =
-                               LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
-
-                       tmp = LLVMBuildLoad(builder, vertliveptr, "");
-                       tmp = LLVMBuildOr(builder, tmp, primlive, ""),
-                       LLVMBuildStore(builder, tmp, vertliveptr);
-
-                       if (i > 0)
-                               ac_build_endif(&ctx->ac, 5121 + i);
-               }
-       }
-       ac_build_endif(&ctx->ac, 5120);
-
-       /* Inclusive scan addition across the current wave. */
-       LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, "");
-       struct ac_wg_scan vertlive_scan = {};
-       vertlive_scan.op = nir_op_iadd;
-       vertlive_scan.enable_reduce = true;
-       vertlive_scan.enable_exclusive = true;
-       vertlive_scan.src = vertlive;
-       vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0);
-       vertlive_scan.waveidx = get_wave_id_in_tg(ctx);
-       vertlive_scan.numwaves = get_tgsize(ctx);
-       vertlive_scan.maxwaves = 8;
-
-       ac_build_wg_scan(&ctx->ac, &vertlive_scan);
-
-       /* Skip all exports (including index exports) when possible. At least on
-        * early gfx10 revisions this is also to avoid hangs.
-        */
-       LLVMValueRef have_exports =
-               LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, "");
-       num_emit_threads =
-               LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, "");
-
-       /* Allocate export space. Send this message as early as possible, to
-        * hide the latency of the SQ <-> SPI roundtrip.
-        *
-        * Note: We could consider compacting primitives for export as well.
-        *       PA processes 1 non-null prim / clock, but it fetches 4 DW of
-        *       prim data per clock and skips null primitives at no additional
-        *       cost. So compacting primitives can only be beneficial when
-        *       there are 4 or more contiguous null primitives in the export
-        *       (in the common case of single-dword prim exports).
-        */
-       ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx),
-                                     vertlive_scan.result_reduce, num_emit_threads);
-
-       /* Setup the reverse vertex compaction permutation. We re-use stream 1
-        * of the primitive liveness flags, relying on the fact that each
-        * threadgroup can have at most 256 threads. */
-       ac_build_ifcc(&ctx->ac, vertlive, 5130);
-       {
-               tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive);
-               tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, "");
-               LLVMBuildStore(builder, tmp2, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1));
-       }
-       ac_build_endif(&ctx->ac, 5130);
-
-       ac_build_s_barrier(&ctx->ac);
-
-       /* Export primitive data */
-       tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
-       ac_build_ifcc(&ctx->ac, tmp, 5140);
-       {
-               LLVMValueRef flags;
-               struct ac_ngg_prim prim = {};
-               prim.num_vertices = verts_per_prim;
-
-               tmp = ngg_gs_vertex_ptr(ctx, tid);
-               flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
-               prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), "");
-
-               for (unsigned i = 0; i < verts_per_prim; ++i) {
-                       prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive,
-                               LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
-                       prim.edgeflag[i] = ctx->ac.i1false;
-               }
-
-               /* Geometry shaders output triangle strips, but NGG expects triangles. */
-               if (verts_per_prim == 3) {
-                       LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, "");
-                       is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, "");
-                       LLVMValueRef flatshade_first =
-                               LLVMBuildICmp(builder, LLVMIntEQ,
-                                             si_unpack_param(ctx, ctx->vs_state_bits, 4, 2),
-                                             ctx->ac.i32_0, "");
-
-                       ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd,
-                                                                   flatshade_first,
-                                                                   prim.index);
-               }
-
-               ac_build_export_prim(&ctx->ac, &prim);
-       }
-       ac_build_endif(&ctx->ac, 5140);
-
-       /* Export position and parameter data */
-       tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, "");
-       ac_build_ifcc(&ctx->ac, tmp, 5145);
-       {
-               struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];
-
-               tmp = ngg_gs_vertex_ptr(ctx, tid);
-               tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), "");
-               tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
-               const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp);
-
-               unsigned out_idx = 0;
-               for (unsigned i = 0; i < info->num_outputs; i++) {
-                       outputs[i].semantic_name = info->output_semantic_name[i];
-                       outputs[i].semantic_index = info->output_semantic_index[i];
-
-                       for (unsigned j = 0; j < 4; j++, out_idx++) {
-                               tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx);
-                               tmp = LLVMBuildLoad(builder, tmp, "");
-                               outputs[i].values[j] = ac_to_float(&ctx->ac, tmp);
-                               outputs[i].vertex_stream[j] =
-                                       (info->output_streams[i] >> (2 * j)) & 3;
-                       }
-               }
-
-               si_llvm_build_vs_exports(ctx, outputs, info->num_outputs);
-       }
-       ac_build_endif(&ctx->ac, 5145);
+   const struct si_shader_selector *sel = ctx->shader->selector;
+   const struct si_shader_info *info = &sel->info;
+   const unsigned verts_per_prim = u_vertices_per_prim(sel->gs_output_prim);
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false);
+   LLVMValueRef tmp, tmp2;
+
+   /* Zero out remaining (non-emitted) primitive flags.
+    *
+    * Note: Alternatively, we could pass the relevant gs_next_vertex to
+    *       the emit threads via LDS. This is likely worse in the expected
+    *       typical case where each GS thread emits the full set of
+    *       vertices.
+    */
+   for (unsigned stream = 0; stream < 4; ++stream) {
+      if (!info->num_stream_output_components[stream])
+         continue;
+
+      const LLVMValueRef gsthread = get_thread_id_in_tg(ctx);
+
+      ac_build_bgnloop(&ctx->ac, 5100);
+
+      const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
+      tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx,
+                          LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), "");
+      ac_build_ifcc(&ctx->ac, tmp, 5101);
+      ac_build_break(&ctx->ac);
+      ac_build_endif(&ctx->ac, 5101);
+
+      tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
+      LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
+
+      tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx);
+      LLVMBuildStore(builder, i8_0, ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream));
+
+      ac_build_endloop(&ctx->ac, 5100);
+   }
+
+   /* Accumulate generated primitives counts across the entire threadgroup. */
+   for (unsigned stream = 0; stream < 4; ++stream) {
+      if (!info->num_stream_output_components[stream])
+         continue;
+
+      LLVMValueRef numprims = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
+      numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size);
+
+      tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, "");
+      ac_build_ifcc(&ctx->ac, tmp, 5105);
+      {
+         LLVMBuildAtomicRMW(
+            builder, LLVMAtomicRMWBinOpAdd,
+            ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, stream, false)),
+            numprims, LLVMAtomicOrderingMonotonic, false);
+      }
+      ac_build_endif(&ctx->ac, 5105);
+   }
+
+   ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
+
+   ac_build_s_barrier(&ctx->ac);
+
+   const LLVMValueRef tid = get_thread_id_in_tg(ctx);
+   LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);
+
+   /* Streamout */
+   if (sel->so.num_outputs) {
+      struct ngg_streamout nggso = {};
+
+      nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false);
+
+      LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid);
+      for (unsigned stream = 0; stream < 4; ++stream) {
+         if (!info->num_stream_output_components[stream])
+            continue;
+
+         tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), "");
+         tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+         tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
+         nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, "");
+      }
+
+      for (unsigned i = 0; i < verts_per_prim; ++i) {
+         tmp = LLVMBuildSub(builder, tid, LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false),
+                            "");
+         tmp = ngg_gs_vertex_ptr(ctx, tmp);
+         nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
+      }
+
+      build_streamout(ctx, &nggso);
+   }
+
+   /* Write shader query data. */
+   if (ctx->screen->use_ngg_streamout) {
+      tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1);
+      tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+      ac_build_ifcc(&ctx->ac, tmp, 5109); /* if (STREAMOUT_QUERY_ENABLED) */
+      unsigned num_query_comps = sel->so.num_outputs ? 8 : 4;
+      tmp = LLVMBuildICmp(builder, LLVMIntULT, tid,
+                          LLVMConstInt(ctx->ac.i32, num_query_comps, false), "");
+      ac_build_ifcc(&ctx->ac, tmp, 5110);
+      {
+         LLVMValueRef offset;
+         tmp = tid;
+         if (sel->so.num_outputs)
+            tmp = LLVMBuildAnd(builder, tmp, LLVMConstInt(ctx->ac.i32, 3, false), "");
+         offset = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 32, false), "");
+         if (sel->so.num_outputs) {
+            tmp = LLVMBuildLShr(builder, tid, LLVMConstInt(ctx->ac.i32, 2, false), "");
+            tmp = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 8, false), "");
+            offset = LLVMBuildAdd(builder, offset, tmp, "");
+         }
+
+         tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
+         LLVMValueRef args[] = {
+            tmp,           ngg_get_query_buf(ctx),
+            offset,        LLVMConstInt(ctx->ac.i32, 16, false), /* soffset */
+            ctx->ac.i32_0,                                       /* cachepolicy */
+         };
+         ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5,
+                            0);
+      }
+      ac_build_endif(&ctx->ac, 5110);
+      ac_build_endif(&ctx->ac, 5109);
+   }
+
+   /* Determine vertex liveness. */
+   LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive");
+
+   tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
+   ac_build_ifcc(&ctx->ac, tmp, 5120);
+   {
+      for (unsigned i = 0; i < verts_per_prim; ++i) {
+         const LLVMValueRef primidx =
+            LLVMBuildAdd(builder, tid, LLVMConstInt(ctx->ac.i32, i, false), "");
+
+         if (i > 0) {
+            tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, "");
+            ac_build_ifcc(&ctx->ac, tmp, 5121 + i);
+         }
+
+         /* Load primitive liveness */
+         tmp = ngg_gs_vertex_ptr(ctx, primidx);
+         tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
+         const LLVMValueRef primlive = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+
+         tmp = LLVMBuildLoad(builder, vertliveptr, "");
+         tmp = LLVMBuildOr(builder, tmp, primlive, ""), LLVMBuildStore(builder, tmp, vertliveptr);
+
+         if (i > 0)
+            ac_build_endif(&ctx->ac, 5121 + i);
+      }
+   }
+   ac_build_endif(&ctx->ac, 5120);
+
+   /* Inclusive scan addition across the current wave. */
+   LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, "");
+   struct ac_wg_scan vertlive_scan = {};
+   vertlive_scan.op = nir_op_iadd;
+   vertlive_scan.enable_reduce = true;
+   vertlive_scan.enable_exclusive = true;
+   vertlive_scan.src = vertlive;
+   vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0);
+   vertlive_scan.waveidx = get_wave_id_in_tg(ctx);
+   vertlive_scan.numwaves = get_tgsize(ctx);
+   vertlive_scan.maxwaves = 8;
+
+   ac_build_wg_scan(&ctx->ac, &vertlive_scan);
+
+   /* Skip all exports (including index exports) when possible. At least on
+    * early gfx10 revisions this is also to avoid hangs.
+    */
+   LLVMValueRef have_exports =
+      LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, "");
+   num_emit_threads = LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, "");
+
+   /* Allocate export space. Send this message as early as possible, to
+    * hide the latency of the SQ <-> SPI roundtrip.
+    *
+    * Note: We could consider compacting primitives for export as well.
+    *       PA processes 1 non-null prim / clock, but it fetches 4 DW of
+    *       prim data per clock and skips null primitives at no additional
+    *       cost. So compacting primitives can only be beneficial when
+    *       there are 4 or more contiguous null primitives in the export
+    *       (in the common case of single-dword prim exports).
+    */
+   ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), vertlive_scan.result_reduce,
+                                 num_emit_threads);
+
+   /* Setup the reverse vertex compaction permutation. We re-use stream 1
+    * of the primitive liveness flags, relying on the fact that each
+    * threadgroup can have at most 256 threads. */
+   ac_build_ifcc(&ctx->ac, vertlive, 5130);
+   {
+      tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive);
+      tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, "");
+      LLVMBuildStore(builder, tmp2, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1));
+   }
+   ac_build_endif(&ctx->ac, 5130);
+
+   ac_build_s_barrier(&ctx->ac);
+
+   /* Export primitive data */
+   tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
+   ac_build_ifcc(&ctx->ac, tmp, 5140);
+   {
+      LLVMValueRef flags;
+      struct ac_ngg_prim prim = {};
+      prim.num_vertices = verts_per_prim;
+
+      tmp = ngg_gs_vertex_ptr(ctx, tid);
+      flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
+      prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), "");
+
+      for (unsigned i = 0; i < verts_per_prim; ++i) {
+         prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive,
+                                      LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
+         prim.edgeflag[i] = ctx->ac.i1false;
+      }
+
+      /* Geometry shaders output triangle strips, but NGG expects triangles. */
+      if (verts_per_prim == 3) {
+         LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, "");
+         is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, "");
+         LLVMValueRef flatshade_first = LLVMBuildICmp(
+            builder, LLVMIntEQ, si_unpack_param(ctx, ctx->vs_state_bits, 4, 2), ctx->ac.i32_0, "");
+
+         ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, prim.index);
+      }
+
+      ac_build_export_prim(&ctx->ac, &prim);
+   }
+   ac_build_endif(&ctx->ac, 5140);
+
+   /* Export position and parameter data */
+   tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, "");
+   ac_build_ifcc(&ctx->ac, tmp, 5145);
+   {
+      struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];
+
+      tmp = ngg_gs_vertex_ptr(ctx, tid);
+      tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), "");
+      tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
+      const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp);
+
+      unsigned out_idx = 0;
+      for (unsigned i = 0; i < info->num_outputs; i++) {
+         outputs[i].semantic_name = info->output_semantic_name[i];
+         outputs[i].semantic_index = info->output_semantic_index[i];
+
+         for (unsigned j = 0; j < 4; j++, out_idx++) {
+            tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx);
+            tmp = LLVMBuildLoad(builder, tmp, "");
+            outputs[i].values[j] = ac_to_float(&ctx->ac, tmp);
+            outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3;
+         }
+      }
+
+      si_llvm_build_vs_exports(ctx, outputs, info->num_outputs);
+   }
+   ac_build_endif(&ctx->ac, 5145);
 }
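
   (Editor's note, not part of the commit: the epilogue above builds a reverse vertex-compaction
   permutation by storing the original thread id into the stream-1 primflag slot at the position
   given by the exclusive scan of vertex liveness. A minimal single-threaded sketch of the same
   idea, with hypothetical names, assuming only what the surrounding code shows:)

   #include <stdbool.h>

   /* Editorial sketch: compact live vertices and remember their original indices, so an
    * exporting slot can later look up which vertex's data it must export. */
   static unsigned compact_vertices(const bool *vertlive, unsigned num_threads,
                                    unsigned char *orig_index)
   {
      unsigned slot = 0; /* running exclusive prefix sum of vertlive */
      for (unsigned tid = 0; tid < num_threads; tid++) {
         if (vertlive[tid])
            orig_index[slot++] = (unsigned char)tid; /* stream-1 primflag slot in the real code */
      }
      return slot; /* corresponds to vertlive_scan.result_reduce, the export count */
   }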
 
 static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts,
-                                    unsigned min_verts_per_prim, bool use_adjacency)
+                                     unsigned min_verts_per_prim, bool use_adjacency)
 {
-       unsigned max_reuse = max_esverts - min_verts_per_prim;
-       if (use_adjacency)
-               max_reuse /= 2;
-       *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
+   unsigned max_reuse = max_esverts - min_verts_per_prim;
+   if (use_adjacency)
+      max_reuse /= 2;
+   *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
 }
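
   (Editor's worked example, not part of the commit: with max_esverts = 128 and triangles as the
   minimum primitive, min_verts_per_prim = 3, the helper above computes max_reuse = 125 and clamps
   max_gsprims to at most 126; with adjacency the reuse budget halves to 62, so the clamp becomes 63.)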
 
 /**
@@ -1992,172 +1886,165 @@ static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts
  */
 void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
 {
-       const struct si_shader_selector *gs_sel = shader->selector;
-       const struct si_shader_selector *es_sel =
-               shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel;
-       const enum pipe_shader_type gs_type = gs_sel->type;
-       const unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1);
-       const unsigned input_prim = si_get_input_prim(gs_sel);
-       const bool use_adjacency = input_prim >= PIPE_PRIM_LINES_ADJACENCY &&
-                                  input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;
-       const unsigned max_verts_per_prim = u_vertices_per_prim(input_prim);
-       const unsigned min_verts_per_prim =
-               gs_type == PIPE_SHADER_GEOMETRY ? max_verts_per_prim : 1;
-
-       /* All these are in dwords: */
-       /* We can't allow using the whole LDS, because GS waves compete with
-        * other shader stages for LDS space.
-        *
-        * TODO: We should really take the shader's internal LDS use into
-        *       account. The linker will fail if the size is greater than
-        *       8K dwords.
-        */
-       const unsigned max_lds_size = 8 * 1024 - 768;
-       const unsigned target_lds_size = max_lds_size;
-       unsigned esvert_lds_size = 0;
-       unsigned gsprim_lds_size = 0;
-
-       /* All these are per subgroup: */
-       bool max_vert_out_per_gs_instance = false;
-       unsigned max_gsprims_base = 128; /* default prim group size clamp */
-       unsigned max_esverts_base = 128;
-
-       if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
-               max_gsprims_base = 128 / 3;
-               max_esverts_base = max_gsprims_base * 3;
-       } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
-               max_gsprims_base = 126;
-               max_esverts_base = 128;
-       }
-
-       /* Hardware has the following non-natural restrictions on the value
-        * of GE_CNTL.VERT_GRP_SIZE based on based on the primitive type of
-        * the draw:
-        *  - at most 252 for any line input primitive type
-        *  - at most 251 for any quad input primitive type
-        *  - at most 251 for triangle strips with adjacency (this happens to
-        *    be the natural limit for triangle *lists* with adjacency)
-        */
-       max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
-
-       if (gs_type == PIPE_SHADER_GEOMETRY) {
-               unsigned max_out_verts_per_gsprim =
-                       gs_sel->gs_max_out_vertices * gs_num_invocations;
-
-               if (max_out_verts_per_gsprim <= 256) {
-                       if (max_out_verts_per_gsprim) {
-                               max_gsprims_base = MIN2(max_gsprims_base,
-                                                       256 / max_out_verts_per_gsprim);
-                       }
-               } else {
-                       /* Use special multi-cycling mode in which each GS
-                        * instance gets its own subgroup. Does not work with
-                        * tessellation. */
-                       max_vert_out_per_gs_instance = true;
-                       max_gsprims_base = 1;
-                       max_out_verts_per_gsprim = gs_sel->gs_max_out_vertices;
-               }
-
-               esvert_lds_size = es_sel->esgs_itemsize / 4;
-               gsprim_lds_size = (gs_sel->gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
-       } else {
-               /* VS and TES. */
-               /* LDS size for passing data from ES to GS. */
-               esvert_lds_size = ngg_nogs_vertex_size(shader);
-       }
-
-       unsigned max_gsprims = max_gsprims_base;
-       unsigned max_esverts = max_esverts_base;
-
-       if (esvert_lds_size)
-               max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
-       if (gsprim_lds_size)
-               max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
-
-       max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
-       clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
-       assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
-
-       if (esvert_lds_size || gsprim_lds_size) {
-               /* Now that we have a rough proportionality between esverts
-                * and gsprims based on the primitive type, scale both of them
-                * down simultaneously based on required LDS space.
-                *
-                * We could be smarter about this if we knew how much vertex
-                * reuse to expect.
-                */
-               unsigned lds_total = max_esverts * esvert_lds_size +
-                                    max_gsprims * gsprim_lds_size;
-               if (lds_total > target_lds_size) {
-                       max_esverts = max_esverts * target_lds_size / lds_total;
-                       max_gsprims = max_gsprims * target_lds_size / lds_total;
-
-                       max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
-                       clamp_gsprims_to_esverts(&max_gsprims, max_esverts,
-                                                min_verts_per_prim, use_adjacency);
-                       assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
-               }
-       }
-
-       /* Round up towards full wave sizes for better ALU utilization. */
-       if (!max_vert_out_per_gs_instance) {
-               const unsigned wavesize = gs_sel->screen->ge_wave_size;
-               unsigned orig_max_esverts;
-               unsigned orig_max_gsprims;
-               do {
-                       orig_max_esverts = max_esverts;
-                       orig_max_gsprims = max_gsprims;
-
-                       max_esverts = align(max_esverts, wavesize);
-                       max_esverts = MIN2(max_esverts, max_esverts_base);
-                       if (esvert_lds_size)
-                               max_esverts = MIN2(max_esverts,
-                                                  (max_lds_size - max_gsprims * gsprim_lds_size) /
-                                                  esvert_lds_size);
-                       max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
-
-                       max_gsprims = align(max_gsprims, wavesize);
-                       max_gsprims = MIN2(max_gsprims, max_gsprims_base);
-                       if (gsprim_lds_size)
-                               max_gsprims = MIN2(max_gsprims,
-                                                  (max_lds_size - max_esverts * esvert_lds_size) /
-                                                  gsprim_lds_size);
-                       clamp_gsprims_to_esverts(&max_gsprims, max_esverts,
-                                                min_verts_per_prim, use_adjacency);
-                       assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
-               } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
-       }
-
-       /* Hardware restriction: minimum value of max_esverts */
-       max_esverts = MAX2(max_esverts, 23 + max_verts_per_prim);
-
-       unsigned max_out_vertices =
-               max_vert_out_per_gs_instance ? gs_sel->gs_max_out_vertices :
-               gs_type == PIPE_SHADER_GEOMETRY ?
-               max_gsprims * gs_num_invocations * gs_sel->gs_max_out_vertices :
-               max_esverts;
-       assert(max_out_vertices <= 256);
-
-       unsigned prim_amp_factor = 1;
-       if (gs_type == PIPE_SHADER_GEOMETRY) {
-               /* Number of output primitives per GS input primitive after
-                * GS instancing. */
-               prim_amp_factor = gs_sel->gs_max_out_vertices;
-       }
-
-       /* The GE only checks against the maximum number of ES verts after
-        * allocating a full GS primitive. So we need to ensure that whenever
-        * this check passes, there is enough space for a full primitive without
-        * vertex reuse.
-        */
-       shader->ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1;
-       shader->ngg.max_gsprims = max_gsprims;
-       shader->ngg.max_out_verts = max_out_vertices;
-       shader->ngg.prim_amp_factor = prim_amp_factor;
-       shader->ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
-
-       shader->gs_info.esgs_ring_size = 4 * max_esverts * esvert_lds_size;
-       shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size;
-
-       assert(shader->ngg.hw_max_esverts >= 24); /* HW limitation */
+   const struct si_shader_selector *gs_sel = shader->selector;
+   const struct si_shader_selector *es_sel =
+      shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel;
+   const enum pipe_shader_type gs_type = gs_sel->type;
+   const unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1);
+   const unsigned input_prim = si_get_input_prim(gs_sel);
+   const bool use_adjacency =
+      input_prim >= PIPE_PRIM_LINES_ADJACENCY && input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;
+   const unsigned max_verts_per_prim = u_vertices_per_prim(input_prim);
+   const unsigned min_verts_per_prim = gs_type == PIPE_SHADER_GEOMETRY ? max_verts_per_prim : 1;
+
+   /* All these are in dwords: */
+   /* We can't allow using the whole LDS, because GS waves compete with
+    * other shader stages for LDS space.
+    *
+    * TODO: We should really take the shader's internal LDS use into
+    *       account. The linker will fail if the size is greater than
+    *       8K dwords.
+    */
+   const unsigned max_lds_size = 8 * 1024 - 768;
+   const unsigned target_lds_size = max_lds_size;
+   unsigned esvert_lds_size = 0;
+   unsigned gsprim_lds_size = 0;
+
+   /* All these are per subgroup: */
+   bool max_vert_out_per_gs_instance = false;
+   unsigned max_gsprims_base = 128; /* default prim group size clamp */
+   unsigned max_esverts_base = 128;
+
+   if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
+      max_gsprims_base = 128 / 3;
+      max_esverts_base = max_gsprims_base * 3;
+   } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
+      max_gsprims_base = 126;
+      max_esverts_base = 128;
+   }
+
+   /* Hardware has the following non-natural restrictions on the value
+    * of GE_CNTL.VERT_GRP_SIZE based on the primitive type of
+    * the draw:
+    *  - at most 252 for any line input primitive type
+    *  - at most 251 for any quad input primitive type
+    *  - at most 251 for triangle strips with adjacency (this happens to
+    *    be the natural limit for triangle *lists* with adjacency)
+    */
+   max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
+
+   if (gs_type == PIPE_SHADER_GEOMETRY) {
+      unsigned max_out_verts_per_gsprim = gs_sel->gs_max_out_vertices * gs_num_invocations;
+
+      if (max_out_verts_per_gsprim <= 256) {
+         if (max_out_verts_per_gsprim) {
+            max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
+         }
+      } else {
+         /* Use special multi-cycling mode in which each GS
+          * instance gets its own subgroup. Does not work with
+          * tessellation. */
+         max_vert_out_per_gs_instance = true;
+         max_gsprims_base = 1;
+         max_out_verts_per_gsprim = gs_sel->gs_max_out_vertices;
+      }
+
+      esvert_lds_size = es_sel->esgs_itemsize / 4;
+      gsprim_lds_size = (gs_sel->gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
+   } else {
+      /* VS and TES. */
+      /* LDS size for passing data from ES to GS. */
+      esvert_lds_size = ngg_nogs_vertex_size(shader);
+   }
+
+   unsigned max_gsprims = max_gsprims_base;
+   unsigned max_esverts = max_esverts_base;
+
+   if (esvert_lds_size)
+      max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
+   if (gsprim_lds_size)
+      max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
+
+   max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+   clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
+   assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+
+   if (esvert_lds_size || gsprim_lds_size) {
+      /* Now that we have a rough proportionality between esverts
+       * and gsprims based on the primitive type, scale both of them
+       * down simultaneously based on required LDS space.
+       *
+       * We could be smarter about this if we knew how much vertex
+       * reuse to expect.
+       */
+      unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
+      if (lds_total > target_lds_size) {
+         max_esverts = max_esverts * target_lds_size / lds_total;
+         max_gsprims = max_gsprims * target_lds_size / lds_total;
+
+         max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+         clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
+         assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+      }
+   }
+
+   /* Round up towards full wave sizes for better ALU utilization. */
+   if (!max_vert_out_per_gs_instance) {
+      const unsigned wavesize = gs_sel->screen->ge_wave_size;
+      unsigned orig_max_esverts;
+      unsigned orig_max_gsprims;
+      do {
+         orig_max_esverts = max_esverts;
+         orig_max_gsprims = max_gsprims;
+
+         max_esverts = align(max_esverts, wavesize);
+         max_esverts = MIN2(max_esverts, max_esverts_base);
+         if (esvert_lds_size)
+            max_esverts =
+               MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
+         max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+
+         max_gsprims = align(max_gsprims, wavesize);
+         max_gsprims = MIN2(max_gsprims, max_gsprims_base);
+         if (gsprim_lds_size)
+            max_gsprims =
+               MIN2(max_gsprims, (max_lds_size - max_esverts * esvert_lds_size) / gsprim_lds_size);
+         clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
+         assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+      } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
+   }
+
+   /* Hardware restriction: minimum value of max_esverts */
+   max_esverts = MAX2(max_esverts, 23 + max_verts_per_prim);
+
+   unsigned max_out_vertices =
+      max_vert_out_per_gs_instance
+         ? gs_sel->gs_max_out_vertices
+         : gs_type == PIPE_SHADER_GEOMETRY
+              ? max_gsprims * gs_num_invocations * gs_sel->gs_max_out_vertices
+              : max_esverts;
+   assert(max_out_vertices <= 256);
+
+   unsigned prim_amp_factor = 1;
+   if (gs_type == PIPE_SHADER_GEOMETRY) {
+      /* Number of output primitives per GS input primitive after
+       * GS instancing. */
+      prim_amp_factor = gs_sel->gs_max_out_vertices;
+   }
+
+   /* The GE only checks against the maximum number of ES verts after
+    * allocating a full GS primitive. So we need to ensure that whenever
+    * this check passes, there is enough space for a full primitive without
+    * vertex reuse.
+    */
+   shader->ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1;
+   shader->ngg.max_gsprims = max_gsprims;
+   shader->ngg.max_out_verts = max_out_vertices;
+   shader->ngg.prim_amp_factor = prim_amp_factor;
+   shader->ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
+
+   shader->gs_info.esgs_ring_size = 4 * max_esverts * esvert_lds_size;
+   shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size;
+
+   assert(shader->ngg.hw_max_esverts >= 24); /* HW limitation */
 }
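
   (Editor's worked example, not part of the commit: with triangle input, max_verts_per_prim = 3,
   the MAX2(max_esverts, 23 + 3) clamp above guarantees hw_max_esverts = max_esverts - 3 + 1 >= 24,
   which matches the hardware limitation asserted on the last line; for the common max_esverts = 128
   the driver programs hw_max_esverts = 126.)
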
index e662de1612745a769c3ec044a70fd400c1b99af9..ab69c7e4ddda318b902068324cd2db32072b2e8c 100644 (file)
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include "si_pipe.h"
 #include "si_compute.h"
+#include "si_pipe.h"
 #include "util/format/u_format.h"
 #include "util/u_log.h"
 #include "util/u_surface.h"
 
-enum {
-       SI_COPY          = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES |
-                          SI_SAVE_FRAGMENT_STATE | SI_DISABLE_RENDER_COND,
+enum
+{
+   SI_COPY =
+      SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES | SI_SAVE_FRAGMENT_STATE | SI_DISABLE_RENDER_COND,
 
-       SI_BLIT          = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES |
-                          SI_SAVE_FRAGMENT_STATE,
+   SI_BLIT = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES | SI_SAVE_FRAGMENT_STATE,
 
-       SI_DECOMPRESS    = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE |
-                          SI_DISABLE_RENDER_COND,
+   SI_DECOMPRESS = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE | SI_DISABLE_RENDER_COND,
 
-       SI_COLOR_RESOLVE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE
+   SI_COLOR_RESOLVE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE
 };
 
 void si_blitter_begin(struct si_context *sctx, enum si_blitter_op op)
 {
-       util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso);
-       util_blitter_save_tessctrl_shader(sctx->blitter, sctx->tcs_shader.cso);
-       util_blitter_save_tesseval_shader(sctx->blitter, sctx->tes_shader.cso);
-       util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso);
-       util_blitter_save_so_targets(sctx->blitter, sctx->streamout.num_targets,
-                                    (struct pipe_stream_output_target**)sctx->streamout.targets);
-       util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer);
-
-       if (op & SI_SAVE_FRAGMENT_STATE) {
-               util_blitter_save_blend(sctx->blitter, sctx->queued.named.blend);
-               util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa);
-               util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state);
-               util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso);
-               util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask);
-               util_blitter_save_scissor(sctx->blitter, &sctx->scissors[0]);
-               util_blitter_save_window_rectangles(sctx->blitter,
-                                                   sctx->window_rectangles_include,
-                                                   sctx->num_window_rectangles,
-                                                   sctx->window_rectangles);
-       }
-
-       if (op & SI_SAVE_FRAMEBUFFER)
-               util_blitter_save_framebuffer(sctx->blitter, &sctx->framebuffer.state);
-
-       if (op & SI_SAVE_TEXTURES) {
-               util_blitter_save_fragment_sampler_states(
-                       sctx->blitter, 2,
-                       (void**)sctx->samplers[PIPE_SHADER_FRAGMENT].sampler_states);
-
-               util_blitter_save_fragment_sampler_views(sctx->blitter, 2,
-                       sctx->samplers[PIPE_SHADER_FRAGMENT].views);
-       }
-
-       if (op & SI_DISABLE_RENDER_COND)
-               sctx->render_cond_force_off = true;
-
-       if (sctx->screen->dpbb_allowed) {
-               sctx->dpbb_force_off = true;
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
-       }
+   util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso);
+   util_blitter_save_tessctrl_shader(sctx->blitter, sctx->tcs_shader.cso);
+   util_blitter_save_tesseval_shader(sctx->blitter, sctx->tes_shader.cso);
+   util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso);
+   util_blitter_save_so_targets(sctx->blitter, sctx->streamout.num_targets,
+                                (struct pipe_stream_output_target **)sctx->streamout.targets);
+   util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer);
+
+   if (op & SI_SAVE_FRAGMENT_STATE) {
+      util_blitter_save_blend(sctx->blitter, sctx->queued.named.blend);
+      util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa);
+      util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state);
+      util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso);
+      util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask);
+      util_blitter_save_scissor(sctx->blitter, &sctx->scissors[0]);
+      util_blitter_save_window_rectangles(sctx->blitter, sctx->window_rectangles_include,
+                                          sctx->num_window_rectangles, sctx->window_rectangles);
+   }
+
+   if (op & SI_SAVE_FRAMEBUFFER)
+      util_blitter_save_framebuffer(sctx->blitter, &sctx->framebuffer.state);
+
+   if (op & SI_SAVE_TEXTURES) {
+      util_blitter_save_fragment_sampler_states(
+         sctx->blitter, 2, (void **)sctx->samplers[PIPE_SHADER_FRAGMENT].sampler_states);
+
+      util_blitter_save_fragment_sampler_views(sctx->blitter, 2,
+                                               sctx->samplers[PIPE_SHADER_FRAGMENT].views);
+   }
+
+   if (op & SI_DISABLE_RENDER_COND)
+      sctx->render_cond_force_off = true;
+
+   if (sctx->screen->dpbb_allowed) {
+      sctx->dpbb_force_off = true;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
+   }
 }
 
 void si_blitter_end(struct si_context *sctx)
 {
-       if (sctx->screen->dpbb_allowed) {
-               sctx->dpbb_force_off = false;
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
-       }
-
-       sctx->render_cond_force_off = false;
-
-       /* Restore shader pointers because the VS blit shader changed all
-        * non-global VS user SGPRs. */
-       sctx->shader_pointers_dirty |= SI_DESCS_SHADER_MASK(VERTEX);
-       sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL;
-       sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0;
-       si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
+   if (sctx->screen->dpbb_allowed) {
+      sctx->dpbb_force_off = false;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
+   }
+
+   sctx->render_cond_force_off = false;
+
+   /* Restore shader pointers because the VS blit shader changed all
+    * non-global VS user SGPRs. */
+   sctx->shader_pointers_dirty |= SI_DESCS_SHADER_MASK(VERTEX);
+   sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL;
+   sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
 }
 
 static unsigned u_max_sample(struct pipe_resource *r)
 {
-       return r->nr_samples ? r->nr_samples - 1 : 0;
+   return r->nr_samples ? r->nr_samples - 1 : 0;
 }
 
-static unsigned
-si_blit_dbcb_copy(struct si_context *sctx,
-                 struct si_texture *src,
-                 struct si_texture *dst,
-                 unsigned planes, unsigned level_mask,
-                 unsigned first_layer, unsigned last_layer,
-                 unsigned first_sample, unsigned last_sample)
+static unsigned si_blit_dbcb_copy(struct si_context *sctx, struct si_texture *src,
+                                  struct si_texture *dst, unsigned planes, unsigned level_mask,
+                                  unsigned first_layer, unsigned last_layer, unsigned first_sample,
+                                  unsigned last_sample)
 {
-       struct pipe_surface surf_tmpl = {{0}};
-       unsigned layer, sample, checked_last_layer, max_layer;
-       unsigned fully_copied_levels = 0;
+   struct pipe_surface surf_tmpl = {{0}};
+   unsigned layer, sample, checked_last_layer, max_layer;
+   unsigned fully_copied_levels = 0;
 
-       if (planes & PIPE_MASK_Z)
-               sctx->dbcb_depth_copy_enabled = true;
-       if (planes & PIPE_MASK_S)
-               sctx->dbcb_stencil_copy_enabled = true;
-       si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+   if (planes & PIPE_MASK_Z)
+      sctx->dbcb_depth_copy_enabled = true;
+   if (planes & PIPE_MASK_S)
+      sctx->dbcb_stencil_copy_enabled = true;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 
-       assert(sctx->dbcb_depth_copy_enabled || sctx->dbcb_stencil_copy_enabled);
+   assert(sctx->dbcb_depth_copy_enabled || sctx->dbcb_stencil_copy_enabled);
 
-       sctx->decompression_enabled = true;
+   sctx->decompression_enabled = true;
 
-       while (level_mask) {
-               unsigned level = u_bit_scan(&level_mask);
+   while (level_mask) {
+      unsigned level = u_bit_scan(&level_mask);
 
-               /* The smaller the mipmap level, the less layers there are
-                * as far as 3D textures are concerned. */
-               max_layer = util_max_layer(&src->buffer.b.b, level);
-               checked_last_layer = MIN2(last_layer, max_layer);
+      /* The smaller the mipmap level, the less layers there are
+       * as far as 3D textures are concerned. */
+      max_layer = util_max_layer(&src->buffer.b.b, level);
+      checked_last_layer = MIN2(last_layer, max_layer);
 
-               surf_tmpl.u.tex.level = level;
+      surf_tmpl.u.tex.level = level;
 
-               for (layer = first_layer; layer <= checked_last_layer; layer++) {
-                       struct pipe_surface *zsurf, *cbsurf;
+      for (layer = first_layer; layer <= checked_last_layer; layer++) {
+         struct pipe_surface *zsurf, *cbsurf;
 
-                       surf_tmpl.format = src->buffer.b.b.format;
-                       surf_tmpl.u.tex.first_layer = layer;
-                       surf_tmpl.u.tex.last_layer = layer;
+         surf_tmpl.format = src->buffer.b.b.format;
+         surf_tmpl.u.tex.first_layer = layer;
+         surf_tmpl.u.tex.last_layer = layer;
 
-                       zsurf = sctx->b.create_surface(&sctx->b, &src->buffer.b.b, &surf_tmpl);
+         zsurf = sctx->b.create_surface(&sctx->b, &src->buffer.b.b, &surf_tmpl);
 
-                       surf_tmpl.format = dst->buffer.b.b.format;
-                       cbsurf = sctx->b.create_surface(&sctx->b, &dst->buffer.b.b, &surf_tmpl);
+         surf_tmpl.format = dst->buffer.b.b.format;
+         cbsurf = sctx->b.create_surface(&sctx->b, &dst->buffer.b.b, &surf_tmpl);
 
-                       for (sample = first_sample; sample <= last_sample; sample++) {
-                               if (sample != sctx->dbcb_copy_sample) {
-                                       sctx->dbcb_copy_sample = sample;
-                                       si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
-                               }
+         for (sample = first_sample; sample <= last_sample; sample++) {
+            if (sample != sctx->dbcb_copy_sample) {
+               sctx->dbcb_copy_sample = sample;
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+            }
 
-                               si_blitter_begin(sctx, SI_DECOMPRESS);
-                               util_blitter_custom_depth_stencil(sctx->blitter, zsurf, cbsurf, 1 << sample,
-                                                                 sctx->custom_dsa_flush, 1.0f);
-                               si_blitter_end(sctx);
-                       }
+            si_blitter_begin(sctx, SI_DECOMPRESS);
+            util_blitter_custom_depth_stencil(sctx->blitter, zsurf, cbsurf, 1 << sample,
+                                              sctx->custom_dsa_flush, 1.0f);
+            si_blitter_end(sctx);
+         }
 
-                       pipe_surface_reference(&zsurf, NULL);
-                       pipe_surface_reference(&cbsurf, NULL);
-               }
+         pipe_surface_reference(&zsurf, NULL);
+         pipe_surface_reference(&cbsurf, NULL);
+      }
 
-               if (first_layer == 0 && last_layer >= max_layer &&
-                   first_sample == 0 && last_sample >= u_max_sample(&src->buffer.b.b))
-                       fully_copied_levels |= 1u << level;
-       }
+      if (first_layer == 0 && last_layer >= max_layer && first_sample == 0 &&
+          last_sample >= u_max_sample(&src->buffer.b.b))
+         fully_copied_levels |= 1u << level;
+   }
 
-       sctx->decompression_enabled = false;
-       sctx->dbcb_depth_copy_enabled = false;
-       sctx->dbcb_stencil_copy_enabled = false;
-       si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+   sctx->decompression_enabled = false;
+   sctx->dbcb_depth_copy_enabled = false;
+   sctx->dbcb_stencil_copy_enabled = false;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 
-       return fully_copied_levels;
+   return fully_copied_levels;
 }
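
si_blit_dbcb_copy() and the in-place helpers below share the same bookkeeping pattern: walk a bit mask of mip levels with u_bit_scan(), process each level layer by layer, and only clear a level from the dirty mask once every layer (and sample) was covered. A rough standalone sketch of that pattern, with __builtin_ctz standing in for u_bit_scan() and made-up level/layer counts:

#include <stdio.h>

/* Pop the lowest set bit, like u_bit_scan() in the driver. */
static unsigned bit_scan(unsigned *mask)
{
   unsigned bit = __builtin_ctz(*mask);
   *mask &= *mask - 1;
   return bit;
}

int main(void)
{
   unsigned dirty_level_mask = 0x2d;       /* levels 0, 2, 3 and 5 are dirty */
   unsigned level_mask = dirty_level_mask; /* levels requested for decompression */
   unsigned fully_processed = 0;
   unsigned first_layer = 0, last_layer = 1000;

   while (level_mask) {
      unsigned level = bit_scan(&level_mask);
      unsigned max_layer = 7 >> level;     /* stand-in for util_max_layer() */
      unsigned checked_last_layer = last_layer < max_layer ? last_layer : max_layer;

      printf("decompress level %u, layers %u..%u\n", level, first_layer, checked_last_layer);

      if (first_layer == 0 && last_layer >= max_layer)
         fully_processed |= 1u << level;
   }

   dirty_level_mask &= ~fully_processed;
   printf("still dirty: 0x%x\n", dirty_level_mask);
   return 0;
}
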
 
 /* Helper function for si_blit_decompress_zs_in_place.
  */
-static void
-si_blit_decompress_zs_planes_in_place(struct si_context *sctx,
-                                     struct si_texture *texture,
-                                     unsigned planes, unsigned level_mask,
-                                     unsigned first_layer, unsigned last_layer)
+static void si_blit_decompress_zs_planes_in_place(struct si_context *sctx,
+                                                  struct si_texture *texture, unsigned planes,
+                                                  unsigned level_mask, unsigned first_layer,
+                                                  unsigned last_layer)
 {
-       struct pipe_surface *zsurf, surf_tmpl = {{0}};
-       unsigned layer, max_layer, checked_last_layer;
-       unsigned fully_decompressed_mask = 0;
+   struct pipe_surface *zsurf, surf_tmpl = {{0}};
+   unsigned layer, max_layer, checked_last_layer;
+   unsigned fully_decompressed_mask = 0;
 
-       if (!level_mask)
-               return;
+   if (!level_mask)
+      return;
 
-       if (planes & PIPE_MASK_S)
-               sctx->db_flush_stencil_inplace = true;
-       if (planes & PIPE_MASK_Z)
-               sctx->db_flush_depth_inplace = true;
-       si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+   if (planes & PIPE_MASK_S)
+      sctx->db_flush_stencil_inplace = true;
+   if (planes & PIPE_MASK_Z)
+      sctx->db_flush_depth_inplace = true;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 
-       surf_tmpl.format = texture->buffer.b.b.format;
+   surf_tmpl.format = texture->buffer.b.b.format;
 
-       sctx->decompression_enabled = true;
+   sctx->decompression_enabled = true;
 
-       while (level_mask) {
-               unsigned level = u_bit_scan(&level_mask);
+   while (level_mask) {
+      unsigned level = u_bit_scan(&level_mask);
 
-               surf_tmpl.u.tex.level = level;
+      surf_tmpl.u.tex.level = level;
 
-               /* The smaller the mipmap level, the less layers there are
-                * as far as 3D textures are concerned. */
-               max_layer = util_max_layer(&texture->buffer.b.b, level);
-               checked_last_layer = MIN2(last_layer, max_layer);
+      /* The smaller the mipmap level, the less layers there are
+       * as far as 3D textures are concerned. */
+      max_layer = util_max_layer(&texture->buffer.b.b, level);
+      checked_last_layer = MIN2(last_layer, max_layer);
 
-               for (layer = first_layer; layer <= checked_last_layer; layer++) {
-                       surf_tmpl.u.tex.first_layer = layer;
-                       surf_tmpl.u.tex.last_layer = layer;
+      for (layer = first_layer; layer <= checked_last_layer; layer++) {
+         surf_tmpl.u.tex.first_layer = layer;
+         surf_tmpl.u.tex.last_layer = layer;
 
-                       zsurf = sctx->b.create_surface(&sctx->b, &texture->buffer.b.b, &surf_tmpl);
+         zsurf = sctx->b.create_surface(&sctx->b, &texture->buffer.b.b, &surf_tmpl);
 
-                       si_blitter_begin(sctx, SI_DECOMPRESS);
-                       util_blitter_custom_depth_stencil(sctx->blitter, zsurf, NULL, ~0,
-                                                         sctx->custom_dsa_flush,
-                                                         1.0f);
-                       si_blitter_end(sctx);
+         si_blitter_begin(sctx, SI_DECOMPRESS);
+         util_blitter_custom_depth_stencil(sctx->blitter, zsurf, NULL, ~0, sctx->custom_dsa_flush,
+                                           1.0f);
+         si_blitter_end(sctx);
 
-                       pipe_surface_reference(&zsurf, NULL);
-               }
+         pipe_surface_reference(&zsurf, NULL);
+      }
 
-               /* The texture will always be dirty if some layers aren't flushed.
-                * I don't think this case occurs often though. */
-               if (first_layer == 0 && last_layer >= max_layer) {
-                       fully_decompressed_mask |= 1u << level;
-               }
-       }
+      /* The texture will always be dirty if some layers aren't flushed.
+       * I don't think this case occurs often though. */
+      if (first_layer == 0 && last_layer >= max_layer) {
+         fully_decompressed_mask |= 1u << level;
+      }
+   }
 
-       if (planes & PIPE_MASK_Z)
-               texture->dirty_level_mask &= ~fully_decompressed_mask;
-       if (planes & PIPE_MASK_S)
-               texture->stencil_dirty_level_mask &= ~fully_decompressed_mask;
+   if (planes & PIPE_MASK_Z)
+      texture->dirty_level_mask &= ~fully_decompressed_mask;
+   if (planes & PIPE_MASK_S)
+      texture->stencil_dirty_level_mask &= ~fully_decompressed_mask;
 
-       sctx->decompression_enabled = false;
-       sctx->db_flush_depth_inplace = false;
-       sctx->db_flush_stencil_inplace = false;
-       si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+   sctx->decompression_enabled = false;
+   sctx->db_flush_depth_inplace = false;
+   sctx->db_flush_stencil_inplace = false;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 }
 
 /* Helper function of si_flush_depth_texture: decompress the given levels
  * of Z and/or S planes in place.
  */
-static void
-si_blit_decompress_zs_in_place(struct si_context *sctx,
-                              struct si_texture *texture,
-                              unsigned levels_z, unsigned levels_s,
-                              unsigned first_layer, unsigned last_layer)
+static void si_blit_decompress_zs_in_place(struct si_context *sctx, struct si_texture *texture,
+                                           unsigned levels_z, unsigned levels_s,
+                                           unsigned first_layer, unsigned last_layer)
 {
-       unsigned both = levels_z & levels_s;
-
-       /* First, do combined Z & S decompresses for levels that need it. */
-       if (both) {
-               si_blit_decompress_zs_planes_in_place(
-                               sctx, texture, PIPE_MASK_Z | PIPE_MASK_S,
-                               both,
-                               first_layer, last_layer);
-               levels_z &= ~both;
-               levels_s &= ~both;
-       }
-
-       /* Now do separate Z and S decompresses. */
-       if (levels_z) {
-               si_blit_decompress_zs_planes_in_place(
-                               sctx, texture, PIPE_MASK_Z,
-                               levels_z,
-                               first_layer, last_layer);
-       }
-
-       if (levels_s) {
-               si_blit_decompress_zs_planes_in_place(
-                               sctx, texture, PIPE_MASK_S,
-                               levels_s,
-                               first_layer, last_layer);
-       }
+   unsigned both = levels_z & levels_s;
+
+   /* First, do combined Z & S decompresses for levels that need it. */
+   if (both) {
+      si_blit_decompress_zs_planes_in_place(sctx, texture, PIPE_MASK_Z | PIPE_MASK_S, both,
+                                            first_layer, last_layer);
+      levels_z &= ~both;
+      levels_s &= ~both;
+   }
+
+   /* Now do separate Z and S decompresses. */
+   if (levels_z) {
+      si_blit_decompress_zs_planes_in_place(sctx, texture, PIPE_MASK_Z, levels_z, first_layer,
+                                            last_layer);
+   }
+
+   if (levels_s) {
+      si_blit_decompress_zs_planes_in_place(sctx, texture, PIPE_MASK_S, levels_s, first_layer,
+                                            last_layer);
+   }
 }
 
-static void
-si_decompress_depth(struct si_context *sctx,
-                   struct si_texture *tex,
-                   unsigned required_planes,
-                   unsigned first_level, unsigned last_level,
-                   unsigned first_layer, unsigned last_layer)
+static void si_decompress_depth(struct si_context *sctx, struct si_texture *tex,
+                                unsigned required_planes, unsigned first_level, unsigned last_level,
+                                unsigned first_layer, unsigned last_layer)
 {
-       unsigned inplace_planes = 0;
-       unsigned copy_planes = 0;
-       unsigned level_mask = u_bit_consecutive(first_level, last_level - first_level + 1);
-       unsigned levels_z = 0;
-       unsigned levels_s = 0;
-
-       if (required_planes & PIPE_MASK_Z) {
-               levels_z = level_mask & tex->dirty_level_mask;
-
-               if (levels_z) {
-                       if (si_can_sample_zs(tex, false))
-                               inplace_planes |= PIPE_MASK_Z;
-                       else
-                               copy_planes |= PIPE_MASK_Z;
-               }
-       }
-       if (required_planes & PIPE_MASK_S) {
-               levels_s = level_mask & tex->stencil_dirty_level_mask;
-
-               if (levels_s) {
-                       if (si_can_sample_zs(tex, true))
-                               inplace_planes |= PIPE_MASK_S;
-                       else
-                               copy_planes |= PIPE_MASK_S;
-               }
-       }
-
-       if (unlikely(sctx->log))
-               u_log_printf(sctx->log,
-                            "\n------------------------------------------------\n"
-                            "Decompress Depth (levels %u - %u, levels Z: 0x%x S: 0x%x)\n\n",
-                            first_level, last_level, levels_z, levels_s);
-
-       /* We may have to allocate the flushed texture here when called from
-        * si_decompress_subresource.
-        */
-       if (copy_planes &&
-           (tex->flushed_depth_texture ||
-            si_init_flushed_depth_texture(&sctx->b, &tex->buffer.b.b))) {
-               struct si_texture *dst = tex->flushed_depth_texture;
-               unsigned fully_copied_levels;
-               unsigned levels = 0;
-
-               assert(tex->flushed_depth_texture);
-
-               if (util_format_is_depth_and_stencil(dst->buffer.b.b.format))
-                       copy_planes = PIPE_MASK_Z | PIPE_MASK_S;
-
-               if (copy_planes & PIPE_MASK_Z) {
-                       levels |= levels_z;
-                       levels_z = 0;
-               }
-               if (copy_planes & PIPE_MASK_S) {
-                       levels |= levels_s;
-                       levels_s = 0;
-               }
-
-               fully_copied_levels = si_blit_dbcb_copy(
-                       sctx, tex, dst, copy_planes, levels,
-                       first_layer, last_layer,
-                       0, u_max_sample(&tex->buffer.b.b));
-
-               if (copy_planes & PIPE_MASK_Z)
-                       tex->dirty_level_mask &= ~fully_copied_levels;
-               if (copy_planes & PIPE_MASK_S)
-                       tex->stencil_dirty_level_mask &= ~fully_copied_levels;
-       }
-
-       if (inplace_planes) {
-               bool has_htile = si_htile_enabled(tex, first_level, inplace_planes);
-               bool tc_compat_htile = vi_tc_compat_htile_enabled(tex, first_level,
-                                                                 inplace_planes);
-
-               /* Don't decompress if there is no HTILE or when HTILE is
-                * TC-compatible. */
-               if (has_htile && !tc_compat_htile) {
-                       si_blit_decompress_zs_in_place(
-                                               sctx, tex,
-                                               levels_z, levels_s,
-                                               first_layer, last_layer);
-               } else {
-                       /* This is only a cache flush.
-                        *
-                        * Only clear the mask that we are flushing, because
-                        * si_make_DB_shader_coherent() treats different levels
-                        * and depth and stencil differently.
-                        */
-                       if (inplace_planes & PIPE_MASK_Z)
-                               tex->dirty_level_mask &= ~levels_z;
-                       if (inplace_planes & PIPE_MASK_S)
-                               tex->stencil_dirty_level_mask &= ~levels_s;
-               }
-
-               /* Only in-place decompression needs to flush DB caches, or
-                * when we don't decompress but TC-compatible planes are dirty.
-                */
-               si_make_DB_shader_coherent(sctx, tex->buffer.b.b.nr_samples,
-                                          inplace_planes & PIPE_MASK_S,
-                                          tc_compat_htile);
-       }
-       /* set_framebuffer_state takes care of coherency for single-sample.
-        * The DB->CB copy uses CB for the final writes.
-        */
-       if (copy_planes && tex->buffer.b.b.nr_samples > 1)
-               si_make_CB_shader_coherent(sctx, tex->buffer.b.b.nr_samples,
-                                          false, true /* no DCC */);
+   unsigned inplace_planes = 0;
+   unsigned copy_planes = 0;
+   unsigned level_mask = u_bit_consecutive(first_level, last_level - first_level + 1);
+   unsigned levels_z = 0;
+   unsigned levels_s = 0;
+
+   if (required_planes & PIPE_MASK_Z) {
+      levels_z = level_mask & tex->dirty_level_mask;
+
+      if (levels_z) {
+         if (si_can_sample_zs(tex, false))
+            inplace_planes |= PIPE_MASK_Z;
+         else
+            copy_planes |= PIPE_MASK_Z;
+      }
+   }
+   if (required_planes & PIPE_MASK_S) {
+      levels_s = level_mask & tex->stencil_dirty_level_mask;
+
+      if (levels_s) {
+         if (si_can_sample_zs(tex, true))
+            inplace_planes |= PIPE_MASK_S;
+         else
+            copy_planes |= PIPE_MASK_S;
+      }
+   }
+
+   if (unlikely(sctx->log))
+      u_log_printf(sctx->log,
+                   "\n------------------------------------------------\n"
+                   "Decompress Depth (levels %u - %u, levels Z: 0x%x S: 0x%x)\n\n",
+                   first_level, last_level, levels_z, levels_s);
+
+   /* We may have to allocate the flushed texture here when called from
+    * si_decompress_subresource.
+    */
+   if (copy_planes &&
+       (tex->flushed_depth_texture || si_init_flushed_depth_texture(&sctx->b, &tex->buffer.b.b))) {
+      struct si_texture *dst = tex->flushed_depth_texture;
+      unsigned fully_copied_levels;
+      unsigned levels = 0;
+
+      assert(tex->flushed_depth_texture);
+
+      if (util_format_is_depth_and_stencil(dst->buffer.b.b.format))
+         copy_planes = PIPE_MASK_Z | PIPE_MASK_S;
+
+      if (copy_planes & PIPE_MASK_Z) {
+         levels |= levels_z;
+         levels_z = 0;
+      }
+      if (copy_planes & PIPE_MASK_S) {
+         levels |= levels_s;
+         levels_s = 0;
+      }
+
+      fully_copied_levels = si_blit_dbcb_copy(sctx, tex, dst, copy_planes, levels, first_layer,
+                                              last_layer, 0, u_max_sample(&tex->buffer.b.b));
+
+      if (copy_planes & PIPE_MASK_Z)
+         tex->dirty_level_mask &= ~fully_copied_levels;
+      if (copy_planes & PIPE_MASK_S)
+         tex->stencil_dirty_level_mask &= ~fully_copied_levels;
+   }
+
+   if (inplace_planes) {
+      bool has_htile = si_htile_enabled(tex, first_level, inplace_planes);
+      bool tc_compat_htile = vi_tc_compat_htile_enabled(tex, first_level, inplace_planes);
+
+      /* Don't decompress if there is no HTILE or when HTILE is
+       * TC-compatible. */
+      if (has_htile && !tc_compat_htile) {
+         si_blit_decompress_zs_in_place(sctx, tex, levels_z, levels_s, first_layer, last_layer);
+      } else {
+         /* This is only a cache flush.
+          *
+          * Only clear the mask that we are flushing, because
+          * si_make_DB_shader_coherent() treats different levels
+          * and depth and stencil differently.
+          */
+         if (inplace_planes & PIPE_MASK_Z)
+            tex->dirty_level_mask &= ~levels_z;
+         if (inplace_planes & PIPE_MASK_S)
+            tex->stencil_dirty_level_mask &= ~levels_s;
+      }
+
+      /* Only in-place decompression needs to flush DB caches, or
+       * when we don't decompress but TC-compatible planes are dirty.
+       */
+      si_make_DB_shader_coherent(sctx, tex->buffer.b.b.nr_samples, inplace_planes & PIPE_MASK_S,
+                                 tc_compat_htile);
+   }
+   /* set_framebuffer_state takes care of coherency for single-sample.
+    * The DB->CB copy uses CB for the final writes.
+    */
+   if (copy_planes && tex->buffer.b.b.nr_samples > 1)
+      si_make_CB_shader_coherent(sctx, tex->buffer.b.b.nr_samples, false, true /* no DCC */);
 }
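
si_decompress_depth() above routes each requested plane either to in-place decompression (when the compressed format can be sampled directly) or to a DB->CB copy into the flushed depth texture. A rough sketch of just that routing decision, with plain ints instead of the driver's structs:

#include <stdio.h>

#define MASK_Z 0x1
#define MASK_S 0x2

/* Returns the planes to decompress in place; *copy_planes receives the
 * planes that need the DB->CB copy path instead. */
static unsigned route_planes(unsigned required, unsigned dirty_z, unsigned dirty_s,
                             int can_sample_z, int can_sample_s, unsigned *copy_planes)
{
   unsigned inplace = 0;
   *copy_planes = 0;

   if ((required & MASK_Z) && dirty_z) {
      if (can_sample_z)
         inplace |= MASK_Z;
      else
         *copy_planes |= MASK_Z;
   }
   if ((required & MASK_S) && dirty_s) {
      if (can_sample_s)
         inplace |= MASK_S;
      else
         *copy_planes |= MASK_S;
   }
   return inplace;
}

int main(void)
{
   unsigned copy;
   unsigned inplace = route_planes(MASK_Z | MASK_S, 0x1, 0x1, 1, 0, &copy);

   printf("in-place: 0x%x, copy: 0x%x\n", inplace, copy); /* 0x1, 0x2 */
   return 0;
}
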
 
-static void
-si_decompress_sampler_depth_textures(struct si_context *sctx,
-                                    struct si_samplers *textures)
+static void si_decompress_sampler_depth_textures(struct si_context *sctx,
+                                                 struct si_samplers *textures)
 {
-       unsigned i;
-       unsigned mask = textures->needs_depth_decompress_mask;
+   unsigned i;
+   unsigned mask = textures->needs_depth_decompress_mask;
 
-       while (mask) {
-               struct pipe_sampler_view *view;
-               struct si_sampler_view *sview;
-               struct si_texture *tex;
+   while (mask) {
+      struct pipe_sampler_view *view;
+      struct si_sampler_view *sview;
+      struct si_texture *tex;
 
-               i = u_bit_scan(&mask);
+      i = u_bit_scan(&mask);
 
-               view = textures->views[i];
-               assert(view);
-               sview = (struct si_sampler_view*)view;
+      view = textures->views[i];
+      assert(view);
+      sview = (struct si_sampler_view *)view;
 
-               tex = (struct si_texture *)view->texture;
-               assert(tex->db_compatible);
+      tex = (struct si_texture *)view->texture;
+      assert(tex->db_compatible);
 
-               si_decompress_depth(sctx, tex,
-                                   sview->is_stencil_sampler ? PIPE_MASK_S : PIPE_MASK_Z,
-                                   view->u.tex.first_level, view->u.tex.last_level,
-                                   0, util_max_layer(&tex->buffer.b.b, view->u.tex.first_level));
-       }
+      si_decompress_depth(sctx, tex, sview->is_stencil_sampler ? PIPE_MASK_S : PIPE_MASK_Z,
+                          view->u.tex.first_level, view->u.tex.last_level, 0,
+                          util_max_layer(&tex->buffer.b.b, view->u.tex.first_level));
+   }
 }
 
-static void si_blit_decompress_color(struct si_context *sctx,
-                                    struct si_texture *tex,
-                                    unsigned first_level, unsigned last_level,
-                                    unsigned first_layer, unsigned last_layer,
-                                    bool need_dcc_decompress,
-                                    bool need_fmask_expand)
+static void si_blit_decompress_color(struct si_context *sctx, struct si_texture *tex,
+                                     unsigned first_level, unsigned last_level,
+                                     unsigned first_layer, unsigned last_layer,
+                                     bool need_dcc_decompress, bool need_fmask_expand)
 {
-       void* custom_blend;
-       unsigned layer, checked_last_layer, max_layer;
-       unsigned level_mask =
-               u_bit_consecutive(first_level, last_level - first_level + 1);
-
-       if (!need_dcc_decompress)
-               level_mask &= tex->dirty_level_mask;
-       if (!level_mask)
-               goto expand_fmask;
-
-       if (unlikely(sctx->log))
-               u_log_printf(sctx->log,
-                            "\n------------------------------------------------\n"
-                            "Decompress Color (levels %u - %u, mask 0x%x)\n\n",
-                            first_level, last_level, level_mask);
-
-       if (need_dcc_decompress) {
-               custom_blend = sctx->custom_blend_dcc_decompress;
-
-               assert(tex->surface.dcc_offset);
-
-               /* disable levels without DCC */
-               for (int i = first_level; i <= last_level; i++) {
-                       if (!vi_dcc_enabled(tex, i))
-                               level_mask &= ~(1 << i);
-               }
-       } else if (tex->surface.fmask_size) {
-               custom_blend = sctx->custom_blend_fmask_decompress;
-       } else {
-               custom_blend = sctx->custom_blend_eliminate_fastclear;
-       }
-
-       sctx->decompression_enabled = true;
-
-       while (level_mask) {
-               unsigned level = u_bit_scan(&level_mask);
-
-               /* The smaller the mipmap level, the less layers there are
-                * as far as 3D textures are concerned. */
-               max_layer = util_max_layer(&tex->buffer.b.b, level);
-               checked_last_layer = MIN2(last_layer, max_layer);
-
-               for (layer = first_layer; layer <= checked_last_layer; layer++) {
-                       struct pipe_surface *cbsurf, surf_tmpl;
-
-                       surf_tmpl.format = tex->buffer.b.b.format;
-                       surf_tmpl.u.tex.level = level;
-                       surf_tmpl.u.tex.first_layer = layer;
-                       surf_tmpl.u.tex.last_layer = layer;
-                       cbsurf = sctx->b.create_surface(&sctx->b, &tex->buffer.b.b, &surf_tmpl);
-
-                       /* Required before and after FMASK and DCC_DECOMPRESS. */
-                       if (custom_blend == sctx->custom_blend_fmask_decompress ||
-                           custom_blend == sctx->custom_blend_dcc_decompress)
-                               sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
-
-                       si_blitter_begin(sctx, SI_DECOMPRESS);
-                       util_blitter_custom_color(sctx->blitter, cbsurf, custom_blend);
-                       si_blitter_end(sctx);
-
-                       if (custom_blend == sctx->custom_blend_fmask_decompress ||
-                           custom_blend == sctx->custom_blend_dcc_decompress)
-                               sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
-
-                       pipe_surface_reference(&cbsurf, NULL);
-               }
-
-               /* The texture will always be dirty if some layers aren't flushed.
-                * I don't think this case occurs often though. */
-               if (first_layer == 0 && last_layer >= max_layer) {
-                       tex->dirty_level_mask &= ~(1 << level);
-               }
-       }
-
-       sctx->decompression_enabled = false;
-       si_make_CB_shader_coherent(sctx, tex->buffer.b.b.nr_samples,
-                                  vi_dcc_enabled(tex, first_level),
-                                  tex->surface.u.gfx9.dcc.pipe_aligned);
+   void *custom_blend;
+   unsigned layer, checked_last_layer, max_layer;
+   unsigned level_mask = u_bit_consecutive(first_level, last_level - first_level + 1);
+
+   if (!need_dcc_decompress)
+      level_mask &= tex->dirty_level_mask;
+   if (!level_mask)
+      goto expand_fmask;
+
+   if (unlikely(sctx->log))
+      u_log_printf(sctx->log,
+                   "\n------------------------------------------------\n"
+                   "Decompress Color (levels %u - %u, mask 0x%x)\n\n",
+                   first_level, last_level, level_mask);
+
+   if (need_dcc_decompress) {
+      custom_blend = sctx->custom_blend_dcc_decompress;
+
+      assert(tex->surface.dcc_offset);
+
+      /* disable levels without DCC */
+      for (int i = first_level; i <= last_level; i++) {
+         if (!vi_dcc_enabled(tex, i))
+            level_mask &= ~(1 << i);
+      }
+   } else if (tex->surface.fmask_size) {
+      custom_blend = sctx->custom_blend_fmask_decompress;
+   } else {
+      custom_blend = sctx->custom_blend_eliminate_fastclear;
+   }
+
+   sctx->decompression_enabled = true;
+
+   while (level_mask) {
+      unsigned level = u_bit_scan(&level_mask);
+
+      /* The smaller the mipmap level, the less layers there are
+       * as far as 3D textures are concerned. */
+      max_layer = util_max_layer(&tex->buffer.b.b, level);
+      checked_last_layer = MIN2(last_layer, max_layer);
+
+      for (layer = first_layer; layer <= checked_last_layer; layer++) {
+         struct pipe_surface *cbsurf, surf_tmpl;
+
+         surf_tmpl.format = tex->buffer.b.b.format;
+         surf_tmpl.u.tex.level = level;
+         surf_tmpl.u.tex.first_layer = layer;
+         surf_tmpl.u.tex.last_layer = layer;
+         cbsurf = sctx->b.create_surface(&sctx->b, &tex->buffer.b.b, &surf_tmpl);
+
+         /* Required before and after FMASK and DCC_DECOMPRESS. */
+         if (custom_blend == sctx->custom_blend_fmask_decompress ||
+             custom_blend == sctx->custom_blend_dcc_decompress)
+            sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+
+         si_blitter_begin(sctx, SI_DECOMPRESS);
+         util_blitter_custom_color(sctx->blitter, cbsurf, custom_blend);
+         si_blitter_end(sctx);
+
+         if (custom_blend == sctx->custom_blend_fmask_decompress ||
+             custom_blend == sctx->custom_blend_dcc_decompress)
+            sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+
+         pipe_surface_reference(&cbsurf, NULL);
+      }
+
+      /* The texture will always be dirty if some layers aren't flushed.
+       * I don't think this case occurs often though. */
+      if (first_layer == 0 && last_layer >= max_layer) {
+         tex->dirty_level_mask &= ~(1 << level);
+      }
+   }
+
+   sctx->decompression_enabled = false;
+   si_make_CB_shader_coherent(sctx, tex->buffer.b.b.nr_samples, vi_dcc_enabled(tex, first_level),
+                              tex->surface.u.gfx9.dcc.pipe_aligned);
 
 expand_fmask:
-       if (need_fmask_expand && tex->surface.fmask_offset && !tex->fmask_is_identity) {
-               si_compute_expand_fmask(&sctx->b, &tex->buffer.b.b);
-               tex->fmask_is_identity = true;
-       }
+   if (need_fmask_expand && tex->surface.fmask_offset && !tex->fmask_is_identity) {
+      si_compute_expand_fmask(&sctx->b, &tex->buffer.b.b);
+      tex->fmask_is_identity = true;
+   }
 }
 
-static void
-si_decompress_color_texture(struct si_context *sctx, struct si_texture *tex,
-                           unsigned first_level, unsigned last_level,
-                           bool need_fmask_expand)
+static void si_decompress_color_texture(struct si_context *sctx, struct si_texture *tex,
+                                        unsigned first_level, unsigned last_level,
+                                        bool need_fmask_expand)
 {
-       /* CMASK or DCC can be discarded and we can still end up here. */
-       if (!tex->cmask_buffer && !tex->surface.fmask_size && !tex->surface.dcc_offset)
-               return;
+   /* CMASK or DCC can be discarded and we can still end up here. */
+   if (!tex->cmask_buffer && !tex->surface.fmask_size && !tex->surface.dcc_offset)
+      return;
 
-       si_blit_decompress_color(sctx, tex, first_level, last_level, 0,
-                                util_max_layer(&tex->buffer.b.b, first_level),
-                                false, need_fmask_expand);
+   si_blit_decompress_color(sctx, tex, first_level, last_level, 0,
+                            util_max_layer(&tex->buffer.b.b, first_level), false,
+                            need_fmask_expand);
 }
 
-static void
-si_decompress_sampler_color_textures(struct si_context *sctx,
-                                    struct si_samplers *textures)
+static void si_decompress_sampler_color_textures(struct si_context *sctx,
+                                                 struct si_samplers *textures)
 {
-       unsigned i;
-       unsigned mask = textures->needs_color_decompress_mask;
+   unsigned i;
+   unsigned mask = textures->needs_color_decompress_mask;
 
-       while (mask) {
-               struct pipe_sampler_view *view;
-               struct si_texture *tex;
+   while (mask) {
+      struct pipe_sampler_view *view;
+      struct si_texture *tex;
 
-               i = u_bit_scan(&mask);
+      i = u_bit_scan(&mask);
 
-               view = textures->views[i];
-               assert(view);
+      view = textures->views[i];
+      assert(view);
 
-               tex = (struct si_texture *)view->texture;
+      tex = (struct si_texture *)view->texture;
 
-               si_decompress_color_texture(sctx, tex, view->u.tex.first_level,
-                                           view->u.tex.last_level, false);
-       }
+      si_decompress_color_texture(sctx, tex, view->u.tex.first_level, view->u.tex.last_level,
+                                  false);
+   }
 }
 
-static void
-si_decompress_image_color_textures(struct si_context *sctx,
-                                  struct si_images *images)
+static void si_decompress_image_color_textures(struct si_context *sctx, struct si_images *images)
 {
-       unsigned i;
-       unsigned mask = images->needs_color_decompress_mask;
+   unsigned i;
+   unsigned mask = images->needs_color_decompress_mask;
 
-       while (mask) {
-               const struct pipe_image_view *view;
-               struct si_texture *tex;
+   while (mask) {
+      const struct pipe_image_view *view;
+      struct si_texture *tex;
 
-               i = u_bit_scan(&mask);
+      i = u_bit_scan(&mask);
 
-               view = &images->views[i];
-               assert(view->resource->target != PIPE_BUFFER);
+      view = &images->views[i];
+      assert(view->resource->target != PIPE_BUFFER);
 
-               tex = (struct si_texture *)view->resource;
+      tex = (struct si_texture *)view->resource;
 
-               si_decompress_color_texture(sctx, tex, view->u.tex.level,
-                                           view->u.tex.level,
-                                           view->access & PIPE_IMAGE_ACCESS_WRITE);
-       }
+      si_decompress_color_texture(sctx, tex, view->u.tex.level, view->u.tex.level,
+                                  view->access & PIPE_IMAGE_ACCESS_WRITE);
+   }
 }
 
-static void si_check_render_feedback_texture(struct si_context *sctx,
-                                            struct si_texture *tex,
-                                            unsigned first_level,
-                                            unsigned last_level,
-                                            unsigned first_layer,
-                                            unsigned last_layer)
+static void si_check_render_feedback_texture(struct si_context *sctx, struct si_texture *tex,
+                                             unsigned first_level, unsigned last_level,
+                                             unsigned first_layer, unsigned last_layer)
 {
-       bool render_feedback = false;
+   bool render_feedback = false;
 
-       if (!tex->surface.dcc_offset)
-               return;
+   if (!tex->surface.dcc_offset)
+      return;
 
-       for (unsigned j = 0; j < sctx->framebuffer.state.nr_cbufs; ++j) {
-               struct si_surface * surf;
+   for (unsigned j = 0; j < sctx->framebuffer.state.nr_cbufs; ++j) {
+      struct si_surface *surf;
 
-               if (!sctx->framebuffer.state.cbufs[j])
-                       continue;
+      if (!sctx->framebuffer.state.cbufs[j])
+         continue;
 
-               surf = (struct si_surface*)sctx->framebuffer.state.cbufs[j];
+      surf = (struct si_surface *)sctx->framebuffer.state.cbufs[j];
 
-               if (tex == (struct si_texture *)surf->base.texture &&
-                   surf->base.u.tex.level >= first_level &&
-                   surf->base.u.tex.level <= last_level &&
-                   surf->base.u.tex.first_layer <= last_layer &&
-                   surf->base.u.tex.last_layer >= first_layer) {
-                       render_feedback = true;
-                       break;
-               }
-       }
+      if (tex == (struct si_texture *)surf->base.texture && surf->base.u.tex.level >= first_level &&
+          surf->base.u.tex.level <= last_level && surf->base.u.tex.first_layer <= last_layer &&
+          surf->base.u.tex.last_layer >= first_layer) {
+         render_feedback = true;
+         break;
+      }
+   }
 
-       if (render_feedback)
-               si_texture_disable_dcc(sctx, tex);
+   if (render_feedback)
+      si_texture_disable_dcc(sctx, tex);
 }
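
si_check_render_feedback_texture() above is essentially a resource-identity check plus a level/layer interval-overlap test between a sampled view and each bound color buffer; when it hits, DCC is disabled to break the feedback loop. A simplified sketch of the overlap test, using generic ranges rather than the driver's surface/view structs:

#include <stdbool.h>
#include <stdio.h>

struct view_range {
   const void *texture;               /* resource identity */
   unsigned first_level, last_level;
   unsigned first_layer, last_layer;
};

/* True if the bound render target aliases the sampled texture and the
 * two level/layer ranges intersect. */
static bool feedback_loop(const struct view_range *sampled, const struct view_range *bound)
{
   return sampled->texture == bound->texture &&
          bound->first_level <= sampled->last_level &&
          bound->last_level >= sampled->first_level &&
          bound->first_layer <= sampled->last_layer &&
          bound->last_layer >= sampled->first_layer;
}

int main(void)
{
   int tex;
   struct view_range sampled = {&tex, 0, 3, 0, 0};
   struct view_range bound = {&tex, 2, 2, 0, 0}; /* rendering to level 2 */

   printf("feedback: %s\n", feedback_loop(&sampled, &bound) ? "yes" : "no");
   return 0;
}
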
 
-static void si_check_render_feedback_textures(struct si_context *sctx,
-                                              struct si_samplers *textures)
+static void si_check_render_feedback_textures(struct si_context *sctx, struct si_samplers *textures)
 {
-       uint32_t mask = textures->enabled_mask;
+   uint32_t mask = textures->enabled_mask;
 
-       while (mask) {
-               const struct pipe_sampler_view *view;
-               struct si_texture *tex;
+   while (mask) {
+      const struct pipe_sampler_view *view;
+      struct si_texture *tex;
 
-               unsigned i = u_bit_scan(&mask);
+      unsigned i = u_bit_scan(&mask);
 
-               view = textures->views[i];
-               if(view->texture->target == PIPE_BUFFER)
-                       continue;
+      view = textures->views[i];
+      if (view->texture->target == PIPE_BUFFER)
+         continue;
 
-               tex = (struct si_texture *)view->texture;
+      tex = (struct si_texture *)view->texture;
 
-               si_check_render_feedback_texture(sctx, tex,
-                                                view->u.tex.first_level,
-                                                view->u.tex.last_level,
-                                                view->u.tex.first_layer,
-                                                view->u.tex.last_layer);
-       }
+      si_check_render_feedback_texture(sctx, tex, view->u.tex.first_level, view->u.tex.last_level,
+                                       view->u.tex.first_layer, view->u.tex.last_layer);
+   }
 }
 
-static void si_check_render_feedback_images(struct si_context *sctx,
-                                            struct si_images *images)
+static void si_check_render_feedback_images(struct si_context *sctx, struct si_images *images)
 {
-       uint32_t mask = images->enabled_mask;
+   uint32_t mask = images->enabled_mask;
 
-       while (mask) {
-               const struct pipe_image_view *view;
-               struct si_texture *tex;
+   while (mask) {
+      const struct pipe_image_view *view;
+      struct si_texture *tex;
 
-               unsigned i = u_bit_scan(&mask);
+      unsigned i = u_bit_scan(&mask);
 
-               view = &images->views[i];
-               if (view->resource->target == PIPE_BUFFER)
-                       continue;
+      view = &images->views[i];
+      if (view->resource->target == PIPE_BUFFER)
+         continue;
 
-               tex = (struct si_texture *)view->resource;
+      tex = (struct si_texture *)view->resource;
 
-               si_check_render_feedback_texture(sctx, tex,
-                                                view->u.tex.level,
-                                                view->u.tex.level,
-                                                view->u.tex.first_layer,
-                                                view->u.tex.last_layer);
-       }
+      si_check_render_feedback_texture(sctx, tex, view->u.tex.level, view->u.tex.level,
+                                       view->u.tex.first_layer, view->u.tex.last_layer);
+   }
 }
 
 static void si_check_render_feedback_resident_textures(struct si_context *sctx)
 {
-       util_dynarray_foreach(&sctx->resident_tex_handles,
-                             struct si_texture_handle *, tex_handle) {
-               struct pipe_sampler_view *view;
-               struct si_texture *tex;
-
-               view = (*tex_handle)->view;
-               if (view->texture->target == PIPE_BUFFER)
-                       continue;
-
-               tex = (struct si_texture *)view->texture;
-
-               si_check_render_feedback_texture(sctx, tex,
-                                                view->u.tex.first_level,
-                                                view->u.tex.last_level,
-                                                view->u.tex.first_layer,
-                                                view->u.tex.last_layer);
-       }
+   util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) {
+      struct pipe_sampler_view *view;
+      struct si_texture *tex;
+
+      view = (*tex_handle)->view;
+      if (view->texture->target == PIPE_BUFFER)
+         continue;
+
+      tex = (struct si_texture *)view->texture;
+
+      si_check_render_feedback_texture(sctx, tex, view->u.tex.first_level, view->u.tex.last_level,
+                                       view->u.tex.first_layer, view->u.tex.last_layer);
+   }
 }
 
 static void si_check_render_feedback_resident_images(struct si_context *sctx)
 {
-       util_dynarray_foreach(&sctx->resident_img_handles,
-                             struct si_image_handle *, img_handle) {
-               struct pipe_image_view *view;
-               struct si_texture *tex;
-
-               view = &(*img_handle)->view;
-               if (view->resource->target == PIPE_BUFFER)
-                       continue;
-
-               tex = (struct si_texture *)view->resource;
-
-               si_check_render_feedback_texture(sctx, tex,
-                                                view->u.tex.level,
-                                                view->u.tex.level,
-                                                view->u.tex.first_layer,
-                                                view->u.tex.last_layer);
-       }
+   util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) {
+      struct pipe_image_view *view;
+      struct si_texture *tex;
+
+      view = &(*img_handle)->view;
+      if (view->resource->target == PIPE_BUFFER)
+         continue;
+
+      tex = (struct si_texture *)view->resource;
+
+      si_check_render_feedback_texture(sctx, tex, view->u.tex.level, view->u.tex.level,
+                                       view->u.tex.first_layer, view->u.tex.last_layer);
+   }
 }
 
 static void si_check_render_feedback(struct si_context *sctx)
 {
-       if (!sctx->need_check_render_feedback)
-               return;
+   if (!sctx->need_check_render_feedback)
+      return;
 
-       /* There is no render feedback if color writes are disabled.
-        * (e.g. a pixel shader with image stores)
-        */
-       if (!si_get_total_colormask(sctx))
-               return;
+   /* There is no render feedback if color writes are disabled.
+    * (e.g. a pixel shader with image stores)
+    */
+   if (!si_get_total_colormask(sctx))
+      return;
 
-       for (int i = 0; i < SI_NUM_SHADERS; ++i) {
-               si_check_render_feedback_images(sctx, &sctx->images[i]);
-               si_check_render_feedback_textures(sctx, &sctx->samplers[i]);
-       }
+   for (int i = 0; i < SI_NUM_SHADERS; ++i) {
+      si_check_render_feedback_images(sctx, &sctx->images[i]);
+      si_check_render_feedback_textures(sctx, &sctx->samplers[i]);
+   }
 
-       si_check_render_feedback_resident_images(sctx);
-       si_check_render_feedback_resident_textures(sctx);
+   si_check_render_feedback_resident_images(sctx);
+   si_check_render_feedback_resident_textures(sctx);
 
-       sctx->need_check_render_feedback = false;
+   sctx->need_check_render_feedback = false;
 }
 
 static void si_decompress_resident_textures(struct si_context *sctx)
 {
-       util_dynarray_foreach(&sctx->resident_tex_needs_color_decompress,
-                             struct si_texture_handle *, tex_handle) {
-               struct pipe_sampler_view *view = (*tex_handle)->view;
-               struct si_texture *tex = (struct si_texture *)view->texture;
-
-               si_decompress_color_texture(sctx, tex, view->u.tex.first_level,
-                                           view->u.tex.last_level, false);
-       }
-
-       util_dynarray_foreach(&sctx->resident_tex_needs_depth_decompress,
-                             struct si_texture_handle *, tex_handle) {
-               struct pipe_sampler_view *view = (*tex_handle)->view;
-               struct si_sampler_view *sview = (struct si_sampler_view *)view;
-               struct si_texture *tex = (struct si_texture *)view->texture;
-
-               si_decompress_depth(sctx, tex,
-                       sview->is_stencil_sampler ? PIPE_MASK_S : PIPE_MASK_Z,
-                       view->u.tex.first_level, view->u.tex.last_level,
-                       0, util_max_layer(&tex->buffer.b.b, view->u.tex.first_level));
-       }
+   util_dynarray_foreach (&sctx->resident_tex_needs_color_decompress, struct si_texture_handle *,
+                          tex_handle) {
+      struct pipe_sampler_view *view = (*tex_handle)->view;
+      struct si_texture *tex = (struct si_texture *)view->texture;
+
+      si_decompress_color_texture(sctx, tex, view->u.tex.first_level, view->u.tex.last_level,
+                                  false);
+   }
+
+   util_dynarray_foreach (&sctx->resident_tex_needs_depth_decompress, struct si_texture_handle *,
+                          tex_handle) {
+      struct pipe_sampler_view *view = (*tex_handle)->view;
+      struct si_sampler_view *sview = (struct si_sampler_view *)view;
+      struct si_texture *tex = (struct si_texture *)view->texture;
+
+      si_decompress_depth(sctx, tex, sview->is_stencil_sampler ? PIPE_MASK_S : PIPE_MASK_Z,
+                          view->u.tex.first_level, view->u.tex.last_level, 0,
+                          util_max_layer(&tex->buffer.b.b, view->u.tex.first_level));
+   }
 }
 
 static void si_decompress_resident_images(struct si_context *sctx)
 {
-       util_dynarray_foreach(&sctx->resident_img_needs_color_decompress,
-                             struct si_image_handle *, img_handle) {
-               struct pipe_image_view *view = &(*img_handle)->view;
-               struct si_texture *tex = (struct si_texture *)view->resource;
-
-               si_decompress_color_texture(sctx, tex, view->u.tex.level,
-                                           view->u.tex.level,
-                                           view->access & PIPE_IMAGE_ACCESS_WRITE);
-       }
+   util_dynarray_foreach (&sctx->resident_img_needs_color_decompress, struct si_image_handle *,
+                          img_handle) {
+      struct pipe_image_view *view = &(*img_handle)->view;
+      struct si_texture *tex = (struct si_texture *)view->resource;
+
+      si_decompress_color_texture(sctx, tex, view->u.tex.level, view->u.tex.level,
+                                  view->access & PIPE_IMAGE_ACCESS_WRITE);
+   }
 }
 
 void si_decompress_textures(struct si_context *sctx, unsigned shader_mask)
 {
-       unsigned compressed_colortex_counter, mask;
-
-       if (sctx->blitter->running)
-               return;
-
-       /* Update the compressed_colortex_mask if necessary. */
-       compressed_colortex_counter = p_atomic_read(&sctx->screen->compressed_colortex_counter);
-       if (compressed_colortex_counter != sctx->last_compressed_colortex_counter) {
-               sctx->last_compressed_colortex_counter = compressed_colortex_counter;
-               si_update_needs_color_decompress_masks(sctx);
-       }
-
-       /* Decompress color & depth textures if needed. */
-       mask = sctx->shader_needs_decompress_mask & shader_mask;
-       while (mask) {
-               unsigned i = u_bit_scan(&mask);
-
-               if (sctx->samplers[i].needs_depth_decompress_mask) {
-                       si_decompress_sampler_depth_textures(sctx, &sctx->samplers[i]);
-               }
-               if (sctx->samplers[i].needs_color_decompress_mask) {
-                       si_decompress_sampler_color_textures(sctx, &sctx->samplers[i]);
-               }
-               if (sctx->images[i].needs_color_decompress_mask) {
-                       si_decompress_image_color_textures(sctx, &sctx->images[i]);
-               }
-       }
-
-       if (shader_mask & u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS)) {
-               if (sctx->uses_bindless_samplers)
-                       si_decompress_resident_textures(sctx);
-               if (sctx->uses_bindless_images)
-                       si_decompress_resident_images(sctx);
-
-               if (sctx->ps_uses_fbfetch) {
-                       struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0];
-                       si_decompress_color_texture(sctx,
-                                                   (struct si_texture*)cb0->texture,
-                                                   cb0->u.tex.first_layer,
-                                                   cb0->u.tex.last_layer, false);
-               }
-
-               si_check_render_feedback(sctx);
-       } else if (shader_mask & (1 << PIPE_SHADER_COMPUTE)) {
-               if (sctx->cs_shader_state.program->sel.info.uses_bindless_samplers)
-                       si_decompress_resident_textures(sctx);
-               if (sctx->cs_shader_state.program->sel.info.uses_bindless_images)
-                       si_decompress_resident_images(sctx);
-       }
+   unsigned compressed_colortex_counter, mask;
+
+   if (sctx->blitter->running)
+      return;
+
+   /* Update the compressed_colortex_mask if necessary. */
+   compressed_colortex_counter = p_atomic_read(&sctx->screen->compressed_colortex_counter);
+   if (compressed_colortex_counter != sctx->last_compressed_colortex_counter) {
+      sctx->last_compressed_colortex_counter = compressed_colortex_counter;
+      si_update_needs_color_decompress_masks(sctx);
+   }
+
+   /* Decompress color & depth textures if needed. */
+   mask = sctx->shader_needs_decompress_mask & shader_mask;
+   while (mask) {
+      unsigned i = u_bit_scan(&mask);
+
+      if (sctx->samplers[i].needs_depth_decompress_mask) {
+         si_decompress_sampler_depth_textures(sctx, &sctx->samplers[i]);
+      }
+      if (sctx->samplers[i].needs_color_decompress_mask) {
+         si_decompress_sampler_color_textures(sctx, &sctx->samplers[i]);
+      }
+      if (sctx->images[i].needs_color_decompress_mask) {
+         si_decompress_image_color_textures(sctx, &sctx->images[i]);
+      }
+   }
+
+   if (shader_mask & u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS)) {
+      if (sctx->uses_bindless_samplers)
+         si_decompress_resident_textures(sctx);
+      if (sctx->uses_bindless_images)
+         si_decompress_resident_images(sctx);
+
+      if (sctx->ps_uses_fbfetch) {
+         struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0];
+         si_decompress_color_texture(sctx, (struct si_texture *)cb0->texture,
+                                     cb0->u.tex.first_layer, cb0->u.tex.last_layer, false);
+      }
+
+      si_check_render_feedback(sctx);
+   } else if (shader_mask & (1 << PIPE_SHADER_COMPUTE)) {
+      if (sctx->cs_shader_state.program->sel.info.uses_bindless_samplers)
+         si_decompress_resident_textures(sctx);
+      if (sctx->cs_shader_state.program->sel.info.uses_bindless_images)
+         si_decompress_resident_images(sctx);
+   }
 }
 
 /* Helper for decompressing a portion of a color or depth resource before
  * blitting if any decompression is needed.
  * The driver doesn't decompress resources automatically while u_blitter is
  * rendering. */
-void si_decompress_subresource(struct pipe_context *ctx,
-                              struct pipe_resource *tex,
-                              unsigned planes, unsigned level,
-                              unsigned first_layer, unsigned last_layer)
+void si_decompress_subresource(struct pipe_context *ctx, struct pipe_resource *tex, unsigned planes,
+                               unsigned level, unsigned first_layer, unsigned last_layer)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_texture *stex = (struct si_texture*)tex;
-
-       if (stex->db_compatible) {
-               planes &= PIPE_MASK_Z | PIPE_MASK_S;
-
-               if (!stex->surface.has_stencil)
-                       planes &= ~PIPE_MASK_S;
-
-               /* If we've rendered into the framebuffer and it's a blitting
-                * source, make sure the decompression pass is invoked
-                * by dirtying the framebuffer.
-                */
-               if (sctx->framebuffer.state.zsbuf &&
-                   sctx->framebuffer.state.zsbuf->u.tex.level == level &&
-                   sctx->framebuffer.state.zsbuf->texture == tex)
-                       si_update_fb_dirtiness_after_rendering(sctx);
-
-               si_decompress_depth(sctx, stex, planes,
-                                   level, level,
-                                   first_layer, last_layer);
-       } else if (stex->surface.fmask_size || stex->cmask_buffer || stex->surface.dcc_offset) {
-               /* If we've rendered into the framebuffer and it's a blitting
-                * source, make sure the decompression pass is invoked
-                * by dirtying the framebuffer.
-                */
-               for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
-                       if (sctx->framebuffer.state.cbufs[i] &&
-                           sctx->framebuffer.state.cbufs[i]->u.tex.level == level &&
-                           sctx->framebuffer.state.cbufs[i]->texture == tex) {
-                               si_update_fb_dirtiness_after_rendering(sctx);
-                               break;
-                       }
-               }
-
-               si_blit_decompress_color(sctx, stex, level, level,
-                                        first_layer, last_layer, false, false);
-       }
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_texture *stex = (struct si_texture *)tex;
+
+   if (stex->db_compatible) {
+      planes &= PIPE_MASK_Z | PIPE_MASK_S;
+
+      if (!stex->surface.has_stencil)
+         planes &= ~PIPE_MASK_S;
+
+      /* If we've rendered into the framebuffer and it's a blitting
+       * source, make sure the decompression pass is invoked
+       * by dirtying the framebuffer.
+       */
+      if (sctx->framebuffer.state.zsbuf && sctx->framebuffer.state.zsbuf->u.tex.level == level &&
+          sctx->framebuffer.state.zsbuf->texture == tex)
+         si_update_fb_dirtiness_after_rendering(sctx);
+
+      si_decompress_depth(sctx, stex, planes, level, level, first_layer, last_layer);
+   } else if (stex->surface.fmask_size || stex->cmask_buffer || stex->surface.dcc_offset) {
+      /* If we've rendered into the framebuffer and it's a blitting
+       * source, make sure the decompression pass is invoked
+       * by dirtying the framebuffer.
+       */
+      for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
+         if (sctx->framebuffer.state.cbufs[i] &&
+             sctx->framebuffer.state.cbufs[i]->u.tex.level == level &&
+             sctx->framebuffer.state.cbufs[i]->texture == tex) {
+            si_update_fb_dirtiness_after_rendering(sctx);
+            break;
+         }
+      }
+
+      si_blit_decompress_color(sctx, stex, level, level, first_layer, last_layer, false, false);
+   }
 }
 
 struct texture_orig_info {
-       unsigned format;
-       unsigned width0;
-       unsigned height0;
-       unsigned npix_x;
-       unsigned npix_y;
-       unsigned npix0_x;
-       unsigned npix0_y;
+   unsigned format;
+   unsigned width0;
+   unsigned height0;
+   unsigned npix_x;
+   unsigned npix_y;
+   unsigned npix0_x;
+   unsigned npix0_y;
 };
 
-void si_resource_copy_region(struct pipe_context *ctx,
-                            struct pipe_resource *dst,
-                            unsigned dst_level,
-                            unsigned dstx, unsigned dsty, unsigned dstz,
-                            struct pipe_resource *src,
-                            unsigned src_level,
-                            const struct pipe_box *src_box)
+void si_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst,
+                             unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz,
+                             struct pipe_resource *src, unsigned src_level,
+                             const struct pipe_box *src_box)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_texture *ssrc = (struct si_texture*)src;
-       struct si_texture *sdst = (struct si_texture*)dst;
-       struct pipe_surface *dst_view, dst_templ;
-       struct pipe_sampler_view src_templ, *src_view;
-       unsigned dst_width, dst_height, src_width0, src_height0;
-       unsigned dst_width0, dst_height0, src_force_level = 0;
-       struct pipe_box sbox, dstbox;
-
-       /* Handle buffers first. */
-       if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
-               si_copy_buffer(sctx, dst, src, dstx, src_box->x, src_box->width);
-               return;
-       }
-
-       if (!util_format_is_compressed(src->format) &&
-           !util_format_is_compressed(dst->format) &&
-           !util_format_is_depth_or_stencil(src->format) &&
-           src->nr_samples <= 1 &&
-           !sdst->surface.dcc_offset &&
-           !(dst->target != src->target &&
-             (src->target == PIPE_TEXTURE_1D_ARRAY || dst->target == PIPE_TEXTURE_1D_ARRAY))) {
-               si_compute_copy_image(sctx, dst, dst_level, src, src_level, dstx, dsty, dstz, src_box);
-               return;
-       }
-
-       assert(u_max_sample(dst) == u_max_sample(src));
-
-       /* The driver doesn't decompress resources automatically while
-        * u_blitter is rendering. */
-       si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level,
-                                 src_box->z, src_box->z + src_box->depth - 1);
-
-       dst_width = u_minify(dst->width0, dst_level);
-       dst_height = u_minify(dst->height0, dst_level);
-       dst_width0 = dst->width0;
-       dst_height0 = dst->height0;
-       src_width0 = src->width0;
-       src_height0 = src->height0;
-
-       util_blitter_default_dst_texture(&dst_templ, dst, dst_level, dstz);
-       util_blitter_default_src_texture(sctx->blitter, &src_templ, src, src_level);
-
-       if (util_format_is_compressed(src->format) ||
-           util_format_is_compressed(dst->format)) {
-               unsigned blocksize = ssrc->surface.bpe;
-
-               if (blocksize == 8)
-                       src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; /* 64-bit block */
-               else
-                       src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; /* 128-bit block */
-               dst_templ.format = src_templ.format;
-
-               dst_width = util_format_get_nblocksx(dst->format, dst_width);
-               dst_height = util_format_get_nblocksy(dst->format, dst_height);
-               dst_width0 = util_format_get_nblocksx(dst->format, dst_width0);
-               dst_height0 = util_format_get_nblocksy(dst->format, dst_height0);
-               src_width0 = util_format_get_nblocksx(src->format, src_width0);
-               src_height0 = util_format_get_nblocksy(src->format, src_height0);
-
-               dstx = util_format_get_nblocksx(dst->format, dstx);
-               dsty = util_format_get_nblocksy(dst->format, dsty);
-
-               sbox.x = util_format_get_nblocksx(src->format, src_box->x);
-               sbox.y = util_format_get_nblocksy(src->format, src_box->y);
-               sbox.z = src_box->z;
-               sbox.width = util_format_get_nblocksx(src->format, src_box->width);
-               sbox.height = util_format_get_nblocksy(src->format, src_box->height);
-               sbox.depth = src_box->depth;
-               src_box = &sbox;
-
-               src_force_level = src_level;
-       } else if (!util_blitter_is_copy_supported(sctx->blitter, dst, src)) {
-               if (util_format_is_subsampled_422(src->format)) {
-                       src_templ.format = PIPE_FORMAT_R8G8B8A8_UINT;
-                       dst_templ.format = PIPE_FORMAT_R8G8B8A8_UINT;
-
-                       dst_width = util_format_get_nblocksx(dst->format, dst_width);
-                       dst_width0 = util_format_get_nblocksx(dst->format, dst_width0);
-                       src_width0 = util_format_get_nblocksx(src->format, src_width0);
-
-                       dstx = util_format_get_nblocksx(dst->format, dstx);
-
-                       sbox = *src_box;
-                       sbox.x = util_format_get_nblocksx(src->format, src_box->x);
-                       sbox.width = util_format_get_nblocksx(src->format, src_box->width);
-                       src_box = &sbox;
-               } else {
-                       unsigned blocksize = ssrc->surface.bpe;
-
-                       switch (blocksize) {
-                       case 1:
-                               dst_templ.format = PIPE_FORMAT_R8_UNORM;
-                               src_templ.format = PIPE_FORMAT_R8_UNORM;
-                               break;
-                       case 2:
-                               dst_templ.format = PIPE_FORMAT_R8G8_UNORM;
-                               src_templ.format = PIPE_FORMAT_R8G8_UNORM;
-                               break;
-                       case 4:
-                               dst_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM;
-                               src_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM;
-                               break;
-                       case 8:
-                               dst_templ.format = PIPE_FORMAT_R16G16B16A16_UINT;
-                               src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT;
-                               break;
-                       case 16:
-                               dst_templ.format = PIPE_FORMAT_R32G32B32A32_UINT;
-                               src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT;
-                               break;
-                       default:
-                               fprintf(stderr, "Unhandled format %s with blocksize %u\n",
-                                       util_format_short_name(src->format), blocksize);
-                               assert(0);
-                       }
-               }
-       }
-
-       /* SNORM8 blitting has precision issues on some chips. Use the SINT
-        * equivalent instead, which doesn't force DCC decompression.
-        * Note that some chips avoid this issue by using SDMA.
-        */
-       if (util_format_is_snorm8(dst_templ.format)) {
-               dst_templ.format = src_templ.format =
-                       util_format_snorm8_to_sint8(dst_templ.format);
-       }
-
-       vi_disable_dcc_if_incompatible_format(sctx, dst, dst_level,
-                                             dst_templ.format);
-       vi_disable_dcc_if_incompatible_format(sctx, src, src_level,
-                                             src_templ.format);
-
-       /* Initialize the surface. */
-       dst_view = si_create_surface_custom(ctx, dst, &dst_templ,
-                                             dst_width0, dst_height0,
-                                             dst_width, dst_height);
-
-       /* Initialize the sampler view. */
-       src_view = si_create_sampler_view_custom(ctx, src, &src_templ,
-                                                src_width0, src_height0,
-                                                src_force_level);
-
-       u_box_3d(dstx, dsty, dstz, abs(src_box->width), abs(src_box->height),
-                abs(src_box->depth), &dstbox);
-
-       /* Copy. */
-       si_blitter_begin(sctx, SI_COPY);
-       util_blitter_blit_generic(sctx->blitter, dst_view, &dstbox,
-                                 src_view, src_box, src_width0, src_height0,
-                                 PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL,
-                                 false);
-       si_blitter_end(sctx);
-
-       pipe_surface_reference(&dst_view, NULL);
-       pipe_sampler_view_reference(&src_view, NULL);
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_texture *ssrc = (struct si_texture *)src;
+   struct si_texture *sdst = (struct si_texture *)dst;
+   struct pipe_surface *dst_view, dst_templ;
+   struct pipe_sampler_view src_templ, *src_view;
+   unsigned dst_width, dst_height, src_width0, src_height0;
+   unsigned dst_width0, dst_height0, src_force_level = 0;
+   struct pipe_box sbox, dstbox;
+
+   /* Handle buffers first. */
+   if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
+      si_copy_buffer(sctx, dst, src, dstx, src_box->x, src_box->width);
+      return;
+   }
+
+   if (!util_format_is_compressed(src->format) && !util_format_is_compressed(dst->format) &&
+       !util_format_is_depth_or_stencil(src->format) && src->nr_samples <= 1 &&
+       !sdst->surface.dcc_offset &&
+       !(dst->target != src->target &&
+         (src->target == PIPE_TEXTURE_1D_ARRAY || dst->target == PIPE_TEXTURE_1D_ARRAY))) {
+      si_compute_copy_image(sctx, dst, dst_level, src, src_level, dstx, dsty, dstz, src_box);
+      return;
+   }
+
+   assert(u_max_sample(dst) == u_max_sample(src));
+
+   /* The driver doesn't decompress resources automatically while
+    * u_blitter is rendering. */
+   si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level, src_box->z,
+                             src_box->z + src_box->depth - 1);
+
+   dst_width = u_minify(dst->width0, dst_level);
+   dst_height = u_minify(dst->height0, dst_level);
+   dst_width0 = dst->width0;
+   dst_height0 = dst->height0;
+   src_width0 = src->width0;
+   src_height0 = src->height0;
+
+   util_blitter_default_dst_texture(&dst_templ, dst, dst_level, dstz);
+   util_blitter_default_src_texture(sctx->blitter, &src_templ, src, src_level);
+
+   if (util_format_is_compressed(src->format) || util_format_is_compressed(dst->format)) {
+      unsigned blocksize = ssrc->surface.bpe;
+
+      if (blocksize == 8)
+         src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; /* 64-bit block */
+      else
+         src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; /* 128-bit block */
+      dst_templ.format = src_templ.format;
+
+      dst_width = util_format_get_nblocksx(dst->format, dst_width);
+      dst_height = util_format_get_nblocksy(dst->format, dst_height);
+      dst_width0 = util_format_get_nblocksx(dst->format, dst_width0);
+      dst_height0 = util_format_get_nblocksy(dst->format, dst_height0);
+      src_width0 = util_format_get_nblocksx(src->format, src_width0);
+      src_height0 = util_format_get_nblocksy(src->format, src_height0);
+
+      dstx = util_format_get_nblocksx(dst->format, dstx);
+      dsty = util_format_get_nblocksy(dst->format, dsty);
+
+      sbox.x = util_format_get_nblocksx(src->format, src_box->x);
+      sbox.y = util_format_get_nblocksy(src->format, src_box->y);
+      sbox.z = src_box->z;
+      sbox.width = util_format_get_nblocksx(src->format, src_box->width);
+      sbox.height = util_format_get_nblocksy(src->format, src_box->height);
+      sbox.depth = src_box->depth;
+      src_box = &sbox;
+
+      src_force_level = src_level;
+   } else if (!util_blitter_is_copy_supported(sctx->blitter, dst, src)) {
+      if (util_format_is_subsampled_422(src->format)) {
+         src_templ.format = PIPE_FORMAT_R8G8B8A8_UINT;
+         dst_templ.format = PIPE_FORMAT_R8G8B8A8_UINT;
+
+         dst_width = util_format_get_nblocksx(dst->format, dst_width);
+         dst_width0 = util_format_get_nblocksx(dst->format, dst_width0);
+         src_width0 = util_format_get_nblocksx(src->format, src_width0);
+
+         dstx = util_format_get_nblocksx(dst->format, dstx);
+
+         sbox = *src_box;
+         sbox.x = util_format_get_nblocksx(src->format, src_box->x);
+         sbox.width = util_format_get_nblocksx(src->format, src_box->width);
+         src_box = &sbox;
+      } else {
+         unsigned blocksize = ssrc->surface.bpe;
+
+         switch (blocksize) {
+         case 1:
+            dst_templ.format = PIPE_FORMAT_R8_UNORM;
+            src_templ.format = PIPE_FORMAT_R8_UNORM;
+            break;
+         case 2:
+            dst_templ.format = PIPE_FORMAT_R8G8_UNORM;
+            src_templ.format = PIPE_FORMAT_R8G8_UNORM;
+            break;
+         case 4:
+            dst_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM;
+            src_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM;
+            break;
+         case 8:
+            dst_templ.format = PIPE_FORMAT_R16G16B16A16_UINT;
+            src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT;
+            break;
+         case 16:
+            dst_templ.format = PIPE_FORMAT_R32G32B32A32_UINT;
+            src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT;
+            break;
+         default:
+            fprintf(stderr, "Unhandled format %s with blocksize %u\n",
+                    util_format_short_name(src->format), blocksize);
+            assert(0);
+         }
+      }
+   }
+
+   /* SNORM8 blitting has precision issues on some chips. Use the SINT
+    * equivalent instead, which doesn't force DCC decompression.
+    * Note that some chips avoid this issue by using SDMA.
+    */
+   if (util_format_is_snorm8(dst_templ.format)) {
+      dst_templ.format = src_templ.format = util_format_snorm8_to_sint8(dst_templ.format);
+   }
+
+   vi_disable_dcc_if_incompatible_format(sctx, dst, dst_level, dst_templ.format);
+   vi_disable_dcc_if_incompatible_format(sctx, src, src_level, src_templ.format);
+
+   /* Initialize the surface. */
+   dst_view = si_create_surface_custom(ctx, dst, &dst_templ, dst_width0, dst_height0, dst_width,
+                                       dst_height);
+
+   /* Initialize the sampler view. */
+   src_view =
+      si_create_sampler_view_custom(ctx, src, &src_templ, src_width0, src_height0, src_force_level);
+
+   u_box_3d(dstx, dsty, dstz, abs(src_box->width), abs(src_box->height), abs(src_box->depth),
+            &dstbox);
+
+   /* Copy. */
+   si_blitter_begin(sctx, SI_COPY);
+   util_blitter_blit_generic(sctx->blitter, dst_view, &dstbox, src_view, src_box, src_width0,
+                             src_height0, PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL, false);
+   si_blitter_end(sctx);
+
+   pipe_surface_reference(&dst_view, NULL);
+   pipe_sampler_view_reference(&src_view, NULL);
 }
 
-static void si_do_CB_resolve(struct si_context *sctx,
-                            const struct pipe_blit_info *info,
-                            struct pipe_resource *dst,
-                            unsigned dst_level, unsigned dst_z,
-                            enum pipe_format format)
+static void si_do_CB_resolve(struct si_context *sctx, const struct pipe_blit_info *info,
+                             struct pipe_resource *dst, unsigned dst_level, unsigned dst_z,
+                             enum pipe_format format)
 {
-       /* Required before and after CB_RESOLVE. */
-       sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
-
-       si_blitter_begin(sctx, SI_COLOR_RESOLVE |
-                        (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
-       util_blitter_custom_resolve_color(sctx->blitter, dst, dst_level, dst_z,
-                                         info->src.resource, info->src.box.z,
-                                         ~0, sctx->custom_blend_resolve,
-                                         format);
-       si_blitter_end(sctx);
-
-       /* Flush caches for possible texturing. */
-       si_make_CB_shader_coherent(sctx, 1, false, true /* no DCC */);
+   /* Required before and after CB_RESOLVE. */
+   sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+
+   si_blitter_begin(
+      sctx, SI_COLOR_RESOLVE | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
+   util_blitter_custom_resolve_color(sctx->blitter, dst, dst_level, dst_z, info->src.resource,
+                                     info->src.box.z, ~0, sctx->custom_blend_resolve, format);
+   si_blitter_end(sctx);
+
+   /* Flush caches for possible texturing. */
+   si_make_CB_shader_coherent(sctx, 1, false, true /* no DCC */);
 }
 
-static bool do_hardware_msaa_resolve(struct pipe_context *ctx,
-                                    const struct pipe_blit_info *info)
+static bool do_hardware_msaa_resolve(struct pipe_context *ctx, const struct pipe_blit_info *info)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
-       struct si_texture *src = (struct si_texture*)info->src.resource;
-       struct si_texture *dst = (struct si_texture*)info->dst.resource;
-       ASSERTED struct si_texture *stmp;
-       unsigned dst_width = u_minify(info->dst.resource->width0, info->dst.level);
-       unsigned dst_height = u_minify(info->dst.resource->height0, info->dst.level);
-       enum pipe_format format = info->src.format;
-       struct pipe_resource *tmp, templ;
-       struct pipe_blit_info blit;
-
-       /* Check basic requirements for hw resolve. */
-       if (!(info->src.resource->nr_samples > 1 &&
-             info->dst.resource->nr_samples <= 1 &&
-             !util_format_is_pure_integer(format) &&
-             !util_format_is_depth_or_stencil(format) &&
-             util_max_layer(info->src.resource, 0) == 0))
-               return false;
-
-       /* Hardware MSAA resolve doesn't work if SPI format = NORM16_ABGR and
-        * the format is R16G16. Use R16A16, which does work.
-        */
-       if (format == PIPE_FORMAT_R16G16_UNORM)
-               format = PIPE_FORMAT_R16A16_UNORM;
-       if (format == PIPE_FORMAT_R16G16_SNORM)
-               format = PIPE_FORMAT_R16A16_SNORM;
-
-       /* Check the remaining requirements for hw resolve. */
-       if (util_max_layer(info->dst.resource, info->dst.level) == 0 &&
-           !info->scissor_enable &&
-           (info->mask & PIPE_MASK_RGBA) == PIPE_MASK_RGBA &&
-           util_is_format_compatible(util_format_description(info->src.format),
-                                     util_format_description(info->dst.format)) &&
-           dst_width == info->src.resource->width0 &&
-           dst_height == info->src.resource->height0 &&
-           info->dst.box.x == 0 &&
-           info->dst.box.y == 0 &&
-           info->dst.box.width == dst_width &&
-           info->dst.box.height == dst_height &&
-           info->dst.box.depth == 1 &&
-           info->src.box.x == 0 &&
-           info->src.box.y == 0 &&
-           info->src.box.width == dst_width &&
-           info->src.box.height == dst_height &&
-           info->src.box.depth == 1 &&
-           !dst->surface.is_linear &&
-           (!dst->cmask_buffer || !dst->dirty_level_mask)) { /* dst cannot be fast-cleared */
-               /* Check the last constraint. */
-               if (src->surface.micro_tile_mode != dst->surface.micro_tile_mode) {
-                       /* The next fast clear will switch to this mode to
-                        * get direct hw resolve next time if the mode is
-                        * different now.
-                        *
-                        * TODO-GFX10: This does not work in GFX10 because MSAA
-                        * is restricted to 64KB_R_X and 64KB_Z_X swizzle modes.
-                        * In some cases we could change the swizzle of the
-                        * destination texture instead, but the more general
-                        * solution is to implement compute shader resolve.
-                        */
-                       src->last_msaa_resolve_target_micro_mode =
-                               dst->surface.micro_tile_mode;
-                       goto resolve_to_temp;
-               }
-
-               /* Resolving into a surface with DCC is unsupported. Since
-                * it's being overwritten anyway, clear it to uncompressed.
-                * This is still the fastest codepath even with this clear.
-                */
-               if (vi_dcc_enabled(dst, info->dst.level)) {
-                       if (!vi_dcc_clear_level(sctx, dst, info->dst.level,
-                                               DCC_UNCOMPRESSED))
-                               goto resolve_to_temp;
-
-                       dst->dirty_level_mask &= ~(1 << info->dst.level);
-               }
-
-               /* Resolve directly from src to dst. */
-               si_do_CB_resolve(sctx, info, info->dst.resource,
-                                info->dst.level, info->dst.box.z, format);
-               return true;
-       }
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_texture *src = (struct si_texture *)info->src.resource;
+   struct si_texture *dst = (struct si_texture *)info->dst.resource;
+   ASSERTED struct si_texture *stmp;
+   unsigned dst_width = u_minify(info->dst.resource->width0, info->dst.level);
+   unsigned dst_height = u_minify(info->dst.resource->height0, info->dst.level);
+   enum pipe_format format = info->src.format;
+   struct pipe_resource *tmp, templ;
+   struct pipe_blit_info blit;
+
+   /* Check basic requirements for hw resolve. */
+   if (!(info->src.resource->nr_samples > 1 && info->dst.resource->nr_samples <= 1 &&
+         !util_format_is_pure_integer(format) && !util_format_is_depth_or_stencil(format) &&
+         util_max_layer(info->src.resource, 0) == 0))
+      return false;
+
+   /* Hardware MSAA resolve doesn't work if SPI format = NORM16_ABGR and
+    * the format is R16G16. Use R16A16, which does work.
+    */
+   if (format == PIPE_FORMAT_R16G16_UNORM)
+      format = PIPE_FORMAT_R16A16_UNORM;
+   if (format == PIPE_FORMAT_R16G16_SNORM)
+      format = PIPE_FORMAT_R16A16_SNORM;
+
+   /* Check the remaining requirements for hw resolve. */
+   if (util_max_layer(info->dst.resource, info->dst.level) == 0 && !info->scissor_enable &&
+       (info->mask & PIPE_MASK_RGBA) == PIPE_MASK_RGBA &&
+       util_is_format_compatible(util_format_description(info->src.format),
+                                 util_format_description(info->dst.format)) &&
+       dst_width == info->src.resource->width0 && dst_height == info->src.resource->height0 &&
+       info->dst.box.x == 0 && info->dst.box.y == 0 && info->dst.box.width == dst_width &&
+       info->dst.box.height == dst_height && info->dst.box.depth == 1 && info->src.box.x == 0 &&
+       info->src.box.y == 0 && info->src.box.width == dst_width &&
+       info->src.box.height == dst_height && info->src.box.depth == 1 && !dst->surface.is_linear &&
+       (!dst->cmask_buffer || !dst->dirty_level_mask)) { /* dst cannot be fast-cleared */
+      /* Check the last constraint. */
+      if (src->surface.micro_tile_mode != dst->surface.micro_tile_mode) {
+         /* The next fast clear will switch to this mode to
+          * get direct hw resolve next time if the mode is
+          * different now.
+          *
+          * TODO-GFX10: This does not work in GFX10 because MSAA
+          * is restricted to 64KB_R_X and 64KB_Z_X swizzle modes.
+          * In some cases we could change the swizzle of the
+          * destination texture instead, but the more general
+          * solution is to implement compute shader resolve.
+          */
+         src->last_msaa_resolve_target_micro_mode = dst->surface.micro_tile_mode;
+         goto resolve_to_temp;
+      }
+
+      /* Resolving into a surface with DCC is unsupported. Since
+       * it's being overwritten anyway, clear it to uncompressed.
+       * This is still the fastest codepath even with this clear.
+       */
+      if (vi_dcc_enabled(dst, info->dst.level)) {
+         if (!vi_dcc_clear_level(sctx, dst, info->dst.level, DCC_UNCOMPRESSED))
+            goto resolve_to_temp;
+
+         dst->dirty_level_mask &= ~(1 << info->dst.level);
+      }
+
+      /* Resolve directly from src to dst. */
+      si_do_CB_resolve(sctx, info, info->dst.resource, info->dst.level, info->dst.box.z, format);
+      return true;
+   }
 
 resolve_to_temp:
-       /* Shader-based resolve is VERY SLOW. Instead, resolve into
-        * a temporary texture and blit.
-        */
-       memset(&templ, 0, sizeof(templ));
-       templ.target = PIPE_TEXTURE_2D;
-       templ.format = info->src.resource->format;
-       templ.width0 = info->src.resource->width0;
-       templ.height0 = info->src.resource->height0;
-       templ.depth0 = 1;
-       templ.array_size = 1;
-       templ.usage = PIPE_USAGE_DEFAULT;
-       templ.flags = SI_RESOURCE_FLAG_FORCE_MSAA_TILING |
-                     SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE |
-                     SI_RESOURCE_FLAG_MICRO_TILE_MODE_SET(src->surface.micro_tile_mode) |
-                     SI_RESOURCE_FLAG_DISABLE_DCC;
-
-       /* The src and dst microtile modes must be the same. */
-       if (sctx->chip_class <= GFX8 &&
-           src->surface.micro_tile_mode == RADEON_MICRO_MODE_DISPLAY)
-               templ.bind = PIPE_BIND_SCANOUT;
-       else
-               templ.bind = 0;
-
-       tmp = ctx->screen->resource_create(ctx->screen, &templ);
-       if (!tmp)
-               return false;
-       stmp = (struct si_texture*)tmp;
-
-       assert(!stmp->surface.is_linear);
-       assert(src->surface.micro_tile_mode == stmp->surface.micro_tile_mode);
-
-       /* resolve */
-       si_do_CB_resolve(sctx, info, tmp, 0, 0, format);
-
-       /* blit */
-       blit = *info;
-       blit.src.resource = tmp;
-       blit.src.box.z = 0;
-
-       si_blitter_begin(sctx, SI_BLIT |
-                        (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
-       util_blitter_blit(sctx->blitter, &blit);
-       si_blitter_end(sctx);
-
-       pipe_resource_reference(&tmp, NULL);
-       return true;
+   /* Shader-based resolve is VERY SLOW. Instead, resolve into
+    * a temporary texture and blit.
+    */
+   memset(&templ, 0, sizeof(templ));
+   templ.target = PIPE_TEXTURE_2D;
+   templ.format = info->src.resource->format;
+   templ.width0 = info->src.resource->width0;
+   templ.height0 = info->src.resource->height0;
+   templ.depth0 = 1;
+   templ.array_size = 1;
+   templ.usage = PIPE_USAGE_DEFAULT;
+   templ.flags = SI_RESOURCE_FLAG_FORCE_MSAA_TILING | SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE |
+                 SI_RESOURCE_FLAG_MICRO_TILE_MODE_SET(src->surface.micro_tile_mode) |
+                 SI_RESOURCE_FLAG_DISABLE_DCC;
+
+   /* The src and dst microtile modes must be the same. */
+   if (sctx->chip_class <= GFX8 && src->surface.micro_tile_mode == RADEON_MICRO_MODE_DISPLAY)
+      templ.bind = PIPE_BIND_SCANOUT;
+   else
+      templ.bind = 0;
+
+   tmp = ctx->screen->resource_create(ctx->screen, &templ);
+   if (!tmp)
+      return false;
+   stmp = (struct si_texture *)tmp;
+
+   assert(!stmp->surface.is_linear);
+   assert(src->surface.micro_tile_mode == stmp->surface.micro_tile_mode);
+
+   /* resolve */
+   si_do_CB_resolve(sctx, info, tmp, 0, 0, format);
+
+   /* blit */
+   blit = *info;
+   blit.src.resource = tmp;
+   blit.src.box.z = 0;
+
+   si_blitter_begin(sctx, SI_BLIT | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
+   util_blitter_blit(sctx->blitter, &blit);
+   si_blitter_end(sctx);
+
+   pipe_resource_reference(&tmp, NULL);
+   return true;
 }
 
-static void si_blit(struct pipe_context *ctx,
-                   const struct pipe_blit_info *info)
+static void si_blit(struct pipe_context *ctx, const struct pipe_blit_info *info)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
-       struct si_texture *dst = (struct si_texture *)info->dst.resource;
-
-       if (do_hardware_msaa_resolve(ctx, info)) {
-               return;
-       }
-
-       /* Using SDMA for copying to a linear texture in GTT is much faster.
-        * This improves DRI PRIME performance.
-        *
-        * resource_copy_region can't do this yet, because dma_copy calls it
-        * on failure (recursion).
-        */
-       if (dst->surface.is_linear &&
-           util_can_blit_via_copy_region(info, false)) {
-               sctx->dma_copy(ctx, info->dst.resource, info->dst.level,
-                                info->dst.box.x, info->dst.box.y,
-                                info->dst.box.z,
-                                info->src.resource, info->src.level,
-                                &info->src.box);
-               return;
-       }
-
-       assert(util_blitter_is_blit_supported(sctx->blitter, info));
-
-       /* The driver doesn't decompress resources automatically while
-        * u_blitter is rendering. */
-       vi_disable_dcc_if_incompatible_format(sctx, info->src.resource,
-                                             info->src.level,
-                                             info->src.format);
-       vi_disable_dcc_if_incompatible_format(sctx, info->dst.resource,
-                                             info->dst.level,
-                                             info->dst.format);
-       si_decompress_subresource(ctx, info->src.resource, PIPE_MASK_RGBAZS,
-                                 info->src.level,
-                                 info->src.box.z,
-                                 info->src.box.z + info->src.box.depth - 1);
-
-       if (sctx->screen->debug_flags & DBG(FORCE_SDMA) &&
-           util_try_blit_via_copy_region(ctx, info))
-               return;
-
-       si_blitter_begin(sctx, SI_BLIT |
-                        (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
-       util_blitter_blit(sctx->blitter, info);
-       si_blitter_end(sctx);
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_texture *dst = (struct si_texture *)info->dst.resource;
+
+   if (do_hardware_msaa_resolve(ctx, info)) {
+      return;
+   }
+
+   /* Using SDMA for copying to a linear texture in GTT is much faster.
+    * This improves DRI PRIME performance.
+    *
+    * resource_copy_region can't do this yet, because dma_copy calls it
+    * on failure (recursion).
+    */
+   if (dst->surface.is_linear && util_can_blit_via_copy_region(info, false)) {
+      sctx->dma_copy(ctx, info->dst.resource, info->dst.level, info->dst.box.x, info->dst.box.y,
+                     info->dst.box.z, info->src.resource, info->src.level, &info->src.box);
+      return;
+   }
+
+   assert(util_blitter_is_blit_supported(sctx->blitter, info));
+
+   /* The driver doesn't decompress resources automatically while
+    * u_blitter is rendering. */
+   vi_disable_dcc_if_incompatible_format(sctx, info->src.resource, info->src.level,
+                                         info->src.format);
+   vi_disable_dcc_if_incompatible_format(sctx, info->dst.resource, info->dst.level,
+                                         info->dst.format);
+   si_decompress_subresource(ctx, info->src.resource, PIPE_MASK_RGBAZS, info->src.level,
+                             info->src.box.z, info->src.box.z + info->src.box.depth - 1);
+
+   if (sctx->screen->debug_flags & DBG(FORCE_SDMA) && util_try_blit_via_copy_region(ctx, info))
+      return;
+
+   si_blitter_begin(sctx, SI_BLIT | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
+   util_blitter_blit(sctx->blitter, info);
+   si_blitter_end(sctx);
 }
 
-static bool si_generate_mipmap(struct pipe_context *ctx,
-                              struct pipe_resource *tex,
-                              enum pipe_format format,
-                              unsigned base_level, unsigned last_level,
-                              unsigned first_layer, unsigned last_layer)
+static bool si_generate_mipmap(struct pipe_context *ctx, struct pipe_resource *tex,
+                               enum pipe_format format, unsigned base_level, unsigned last_level,
+                               unsigned first_layer, unsigned last_layer)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
-       struct si_texture *stex = (struct si_texture *)tex;
-
-       if (!util_blitter_is_copy_supported(sctx->blitter, tex, tex))
-               return false;
-
-       /* The driver doesn't decompress resources automatically while
-        * u_blitter is rendering. */
-       vi_disable_dcc_if_incompatible_format(sctx, tex, base_level,
-                                             format);
-       si_decompress_subresource(ctx, tex, PIPE_MASK_RGBAZS,
-                                 base_level, first_layer, last_layer);
-
-       /* Clear dirty_level_mask for the levels that will be overwritten. */
-       assert(base_level < last_level);
-       stex->dirty_level_mask &= ~u_bit_consecutive(base_level + 1,
-                                                    last_level - base_level);
-
-       sctx->generate_mipmap_for_depth = stex->is_depth;
-
-       si_blitter_begin(sctx, SI_BLIT | SI_DISABLE_RENDER_COND);
-       util_blitter_generate_mipmap(sctx->blitter, tex, format,
-                                    base_level, last_level,
-                                    first_layer, last_layer);
-       si_blitter_end(sctx);
-
-       sctx->generate_mipmap_for_depth = false;
-       return true;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_texture *stex = (struct si_texture *)tex;
+
+   if (!util_blitter_is_copy_supported(sctx->blitter, tex, tex))
+      return false;
+
+   /* The driver doesn't decompress resources automatically while
+    * u_blitter is rendering. */
+   vi_disable_dcc_if_incompatible_format(sctx, tex, base_level, format);
+   si_decompress_subresource(ctx, tex, PIPE_MASK_RGBAZS, base_level, first_layer, last_layer);
+
+   /* Clear dirty_level_mask for the levels that will be overwritten. */
+   assert(base_level < last_level);
+   stex->dirty_level_mask &= ~u_bit_consecutive(base_level + 1, last_level - base_level);
+
+   sctx->generate_mipmap_for_depth = stex->is_depth;
+
+   si_blitter_begin(sctx, SI_BLIT | SI_DISABLE_RENDER_COND);
+   util_blitter_generate_mipmap(sctx->blitter, tex, format, base_level, last_level, first_layer,
+                                last_layer);
+   si_blitter_end(sctx);
+
+   sctx->generate_mipmap_for_depth = false;
+   return true;
 }
 
-static void si_flush_resource(struct pipe_context *ctx,
-                             struct pipe_resource *res)
+static void si_flush_resource(struct pipe_context *ctx, struct pipe_resource *res)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
-       struct si_texture *tex = (struct si_texture*)res;
-
-       assert(res->target != PIPE_BUFFER);
-       assert(!tex->dcc_separate_buffer || tex->dcc_gather_statistics);
-
-       /* st/dri calls flush twice per frame (not a bug), this prevents double
-        * decompression. */
-       if (tex->dcc_separate_buffer && !tex->separate_dcc_dirty)
-               return;
-
-       if (!tex->is_depth && (tex->cmask_buffer || tex->surface.dcc_offset)) {
-               si_blit_decompress_color(sctx, tex, 0, res->last_level,
-                                        0, util_max_layer(res, 0),
-                                        tex->dcc_separate_buffer != NULL, false);
-
-               if (tex->surface.display_dcc_offset && tex->displayable_dcc_dirty) {
-                       si_retile_dcc(sctx, tex);
-                       tex->displayable_dcc_dirty = false;
-               }
-       }
-
-       /* Always do the analysis even if DCC is disabled at the moment. */
-       if (tex->dcc_gather_statistics) {
-               bool separate_dcc_dirty = tex->separate_dcc_dirty;
-
-               /* If the color buffer hasn't been unbound and fast clear hasn't
-                * been used, separate_dcc_dirty is false, but there may have been
-                * new rendering. Check if the color buffer is bound and assume
-                * it's dirty.
-                *
-                * Note that DRI2 never unbinds window colorbuffers, which means
-                * the DCC pipeline statistics query would never be re-set and would
-                * keep adding new results until all free memory is exhausted if we
-                * didn't do this.
-                */
-               if (!separate_dcc_dirty) {
-                       for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
-                               if (sctx->framebuffer.state.cbufs[i] &&
-                                   sctx->framebuffer.state.cbufs[i]->texture == res) {
-                                       separate_dcc_dirty = true;
-                                       break;
-                               }
-                       }
-               }
-
-               if (separate_dcc_dirty) {
-                       tex->separate_dcc_dirty = false;
-                       vi_separate_dcc_process_and_reset_stats(ctx, tex);
-               }
-       }
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_texture *tex = (struct si_texture *)res;
+
+   assert(res->target != PIPE_BUFFER);
+   assert(!tex->dcc_separate_buffer || tex->dcc_gather_statistics);
+
+   /* st/dri calls flush twice per frame (not a bug); this prevents double
+    * decompression. */
+   if (tex->dcc_separate_buffer && !tex->separate_dcc_dirty)
+      return;
+
+   if (!tex->is_depth && (tex->cmask_buffer || tex->surface.dcc_offset)) {
+      si_blit_decompress_color(sctx, tex, 0, res->last_level, 0, util_max_layer(res, 0),
+                               tex->dcc_separate_buffer != NULL, false);
+
+      if (tex->surface.display_dcc_offset && tex->displayable_dcc_dirty) {
+         si_retile_dcc(sctx, tex);
+         tex->displayable_dcc_dirty = false;
+      }
+   }
+
+   /* Always do the analysis even if DCC is disabled at the moment. */
+   if (tex->dcc_gather_statistics) {
+      bool separate_dcc_dirty = tex->separate_dcc_dirty;
+
+      /* If the color buffer hasn't been unbound and fast clear hasn't
+       * been used, separate_dcc_dirty is false, but there may have been
+       * new rendering. Check if the color buffer is bound and assume
+       * it's dirty.
+       *
+       * Note that DRI2 never unbinds window colorbuffers, which means
+       * the DCC pipeline statistics query would never be re-set and would
+       * keep adding new results until all free memory is exhausted if we
+       * didn't do this.
+       */
+      if (!separate_dcc_dirty) {
+         for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
+            if (sctx->framebuffer.state.cbufs[i] &&
+                sctx->framebuffer.state.cbufs[i]->texture == res) {
+               separate_dcc_dirty = true;
+               break;
+            }
+         }
+      }
+
+      if (separate_dcc_dirty) {
+         tex->separate_dcc_dirty = false;
+         vi_separate_dcc_process_and_reset_stats(ctx, tex);
+      }
+   }
 }
 
 void si_decompress_dcc(struct si_context *sctx, struct si_texture *tex)
 {
-       /* If graphics is disabled, we can't decompress DCC, but it shouldn't
-        * be compressed either. The caller should simply discard it.
-        */
-       if (!tex->surface.dcc_offset || !sctx->has_graphics)
-               return;
-
-       si_blit_decompress_color(sctx, tex, 0, tex->buffer.b.b.last_level,
-                                0, util_max_layer(&tex->buffer.b.b, 0),
-                                true, false);
+   /* If graphics is disabled, we can't decompress DCC, but it shouldn't
+    * be compressed either. The caller should simply discard it.
+    */
+   if (!tex->surface.dcc_offset || !sctx->has_graphics)
+      return;
+
+   si_blit_decompress_color(sctx, tex, 0, tex->buffer.b.b.last_level, 0,
+                            util_max_layer(&tex->buffer.b.b, 0), true, false);
 }
 
 void si_init_blit_functions(struct si_context *sctx)
 {
-       sctx->b.resource_copy_region = si_resource_copy_region;
+   sctx->b.resource_copy_region = si_resource_copy_region;
 
-       if (sctx->has_graphics) {
-               sctx->b.blit = si_blit;
-               sctx->b.flush_resource = si_flush_resource;
-               sctx->b.generate_mipmap = si_generate_mipmap;
-       }
+   if (sctx->has_graphics) {
+      sctx->b.blit = si_blit;
+      sctx->b.flush_resource = si_flush_resource;
+      sctx->b.generate_mipmap = si_generate_mipmap;
+   }
 }
diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c
index 38d8e9456c29014fd86eaf0295b152f9d141cd66..eb71636d3462477bcbf9a928e5b80ca5187fd8fd 100644 (file)
 
 #include "radeonsi/si_pipe.h"
 #include "util/u_memory.h"
-#include "util/u_upload_mgr.h"
 #include "util/u_transfer.h"
+#include "util/u_upload_mgr.h"
+
 #include <inttypes.h>
 #include <stdio.h>
 
-bool si_rings_is_buffer_referenced(struct si_context *sctx,
-                                  struct pb_buffer *buf,
-                                  enum radeon_bo_usage usage)
+bool si_rings_is_buffer_referenced(struct si_context *sctx, struct pb_buffer *buf,
+                                   enum radeon_bo_usage usage)
 {
-       if (sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs, buf, usage)) {
-               return true;
-       }
-       if (radeon_emitted(sctx->sdma_cs, 0) &&
-           sctx->ws->cs_is_buffer_referenced(sctx->sdma_cs, buf, usage)) {
-               return true;
-       }
-       return false;
+   if (sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs, buf, usage)) {
+      return true;
+   }
+   if (radeon_emitted(sctx->sdma_cs, 0) &&
+       sctx->ws->cs_is_buffer_referenced(sctx->sdma_cs, buf, usage)) {
+      return true;
+   }
+   return false;
 }
 
-void *si_buffer_map_sync_with_rings(struct si_context *sctx,
-                                   struct si_resource *resource,
-                                   unsigned usage)
+void *si_buffer_map_sync_with_rings(struct si_context *sctx, struct si_resource *resource,
+                                    unsigned usage)
 {
-       enum radeon_bo_usage rusage = RADEON_USAGE_READWRITE;
-       bool busy = false;
-
-       assert(!(resource->flags & RADEON_FLAG_SPARSE));
-
-       if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) {
-               return sctx->ws->buffer_map(resource->buf, NULL, usage);
-       }
-
-       if (!(usage & PIPE_TRANSFER_WRITE)) {
-               /* have to wait for the last write */
-               rusage = RADEON_USAGE_WRITE;
-       }
-
-       if (radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size) &&
-           sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs,
-                                               resource->buf, rusage)) {
-               if (usage & PIPE_TRANSFER_DONTBLOCK) {
-                       si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
-                       return NULL;
-               } else {
-                       si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
-                       busy = true;
-               }
-       }
-       if (radeon_emitted(sctx->sdma_cs, 0) &&
-           sctx->ws->cs_is_buffer_referenced(sctx->sdma_cs,
-                                               resource->buf, rusage)) {
-               if (usage & PIPE_TRANSFER_DONTBLOCK) {
-                       si_flush_dma_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
-                       return NULL;
-               } else {
-                       si_flush_dma_cs(sctx, 0, NULL);
-                       busy = true;
-               }
-       }
-
-       if (busy || !sctx->ws->buffer_wait(resource->buf, 0, rusage)) {
-               if (usage & PIPE_TRANSFER_DONTBLOCK) {
-                       return NULL;
-               } else {
-                       /* We will be wait for the GPU. Wait for any offloaded
-                        * CS flush to complete to avoid busy-waiting in the winsys. */
-                       sctx->ws->cs_sync_flush(sctx->gfx_cs);
-                       if (sctx->sdma_cs)
-                               sctx->ws->cs_sync_flush(sctx->sdma_cs);
-               }
-       }
-
-       /* Setting the CS to NULL will prevent doing checks we have done already. */
-       return sctx->ws->buffer_map(resource->buf, NULL, usage);
+   enum radeon_bo_usage rusage = RADEON_USAGE_READWRITE;
+   bool busy = false;
+
+   assert(!(resource->flags & RADEON_FLAG_SPARSE));
+
+   if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) {
+      return sctx->ws->buffer_map(resource->buf, NULL, usage);
+   }
+
+   if (!(usage & PIPE_TRANSFER_WRITE)) {
+      /* have to wait for the last write */
+      rusage = RADEON_USAGE_WRITE;
+   }
+
+   if (radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size) &&
+       sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs, resource->buf, rusage)) {
+      if (usage & PIPE_TRANSFER_DONTBLOCK) {
+         si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+         return NULL;
+      } else {
+         si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+         busy = true;
+      }
+   }
+   if (radeon_emitted(sctx->sdma_cs, 0) &&
+       sctx->ws->cs_is_buffer_referenced(sctx->sdma_cs, resource->buf, rusage)) {
+      if (usage & PIPE_TRANSFER_DONTBLOCK) {
+         si_flush_dma_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+         return NULL;
+      } else {
+         si_flush_dma_cs(sctx, 0, NULL);
+         busy = true;
+      }
+   }
+
+   if (busy || !sctx->ws->buffer_wait(resource->buf, 0, rusage)) {
+      if (usage & PIPE_TRANSFER_DONTBLOCK) {
+         return NULL;
+      } else {
+         /* We will wait for the GPU. Wait for any offloaded
+          * CS flush to complete to avoid busy-waiting in the winsys. */
+         sctx->ws->cs_sync_flush(sctx->gfx_cs);
+         if (sctx->sdma_cs)
+            sctx->ws->cs_sync_flush(sctx->sdma_cs);
+      }
+   }
+
+   /* Passing a NULL CS avoids repeating the checks we have already done. */
+   return sctx->ws->buffer_map(resource->buf, NULL, usage);
 }
 
-void si_init_resource_fields(struct si_screen *sscreen,
-                            struct si_resource *res,
-                            uint64_t size, unsigned alignment)
+void si_init_resource_fields(struct si_screen *sscreen, struct si_resource *res, uint64_t size,
+                             unsigned alignment)
 {
-       struct si_texture *tex = (struct si_texture*)res;
-
-       res->bo_size = size;
-       res->bo_alignment = alignment;
-       res->flags = 0;
-       res->texture_handle_allocated = false;
-       res->image_handle_allocated = false;
-
-       switch (res->b.b.usage) {
-       case PIPE_USAGE_STREAM:
-               res->flags = RADEON_FLAG_GTT_WC;
-               /* fall through */
-       case PIPE_USAGE_STAGING:
-               /* Transfers are likely to occur more often with these
-                * resources. */
-               res->domains = RADEON_DOMAIN_GTT;
-               break;
-       case PIPE_USAGE_DYNAMIC:
-               /* Older kernels didn't always flush the HDP cache before
-                * CS execution
-                */
-               if (!sscreen->info.kernel_flushes_hdp_before_ib) {
-                       res->domains = RADEON_DOMAIN_GTT;
-                       res->flags |= RADEON_FLAG_GTT_WC;
-                       break;
-               }
-               /* fall through */
-       case PIPE_USAGE_DEFAULT:
-       case PIPE_USAGE_IMMUTABLE:
-       default:
-               /* Not listing GTT here improves performance in some
-                * apps. */
-               res->domains = RADEON_DOMAIN_VRAM;
-               res->flags |= RADEON_FLAG_GTT_WC;
-               break;
-       }
-
-       if (res->b.b.target == PIPE_BUFFER &&
-           res->b.b.flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) {
-               /* Use GTT for all persistent mappings with older
-                * kernels, because they didn't always flush the HDP
-                * cache before CS execution.
-                *
-                * Write-combined CPU mappings are fine, the kernel
-                * ensures all CPU writes finish before the GPU
-                * executes a command stream.
-                *
-                * radeon doesn't have good BO move throttling, so put all
-                * persistent buffers into GTT to prevent VRAM CPU page faults.
-                */
-               if (!sscreen->info.kernel_flushes_hdp_before_ib ||
-                   !sscreen->info.is_amdgpu)
-                       res->domains = RADEON_DOMAIN_GTT;
-       }
-
-       /* Tiled textures are unmappable. Always put them in VRAM. */
-       if ((res->b.b.target != PIPE_BUFFER && !tex->surface.is_linear) ||
-           res->b.b.flags & SI_RESOURCE_FLAG_UNMAPPABLE) {
-               res->domains = RADEON_DOMAIN_VRAM;
-               res->flags |= RADEON_FLAG_NO_CPU_ACCESS |
-                        RADEON_FLAG_GTT_WC;
-       }
-
-       /* Displayable and shareable surfaces are not suballocated. */
-       if (res->b.b.bind & (PIPE_BIND_SHARED | PIPE_BIND_SCANOUT))
-               res->flags |= RADEON_FLAG_NO_SUBALLOC; /* shareable */
-       else
-               res->flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
-
-       if (sscreen->debug_flags & DBG(NO_WC))
-               res->flags &= ~RADEON_FLAG_GTT_WC;
-
-       if (res->b.b.flags & SI_RESOURCE_FLAG_READ_ONLY)
-               res->flags |= RADEON_FLAG_READ_ONLY;
-
-       if (res->b.b.flags & SI_RESOURCE_FLAG_32BIT)
-               res->flags |= RADEON_FLAG_32BIT;
-
-       /* Set expected VRAM and GART usage for the buffer. */
-       res->vram_usage = 0;
-       res->gart_usage = 0;
-       res->max_forced_staging_uploads = 0;
-       res->b.max_forced_staging_uploads = 0;
-
-       if (res->domains & RADEON_DOMAIN_VRAM) {
-               res->vram_usage = size;
-
-               res->max_forced_staging_uploads =
-               res->b.max_forced_staging_uploads =
-                       sscreen->info.has_dedicated_vram &&
-                       size >= sscreen->info.vram_vis_size / 4 ? 1 : 0;
-       } else if (res->domains & RADEON_DOMAIN_GTT) {
-               res->gart_usage = size;
-       }
+   struct si_texture *tex = (struct si_texture *)res;
+
+   res->bo_size = size;
+   res->bo_alignment = alignment;
+   res->flags = 0;
+   res->texture_handle_allocated = false;
+   res->image_handle_allocated = false;
+
+   switch (res->b.b.usage) {
+   case PIPE_USAGE_STREAM:
+      res->flags = RADEON_FLAG_GTT_WC;
+      /* fall through */
+   case PIPE_USAGE_STAGING:
+      /* Transfers are likely to occur more often with these
+       * resources. */
+      res->domains = RADEON_DOMAIN_GTT;
+      break;
+   case PIPE_USAGE_DYNAMIC:
+      /* Older kernels didn't always flush the HDP cache before
+       * CS execution
+       */
+      if (!sscreen->info.kernel_flushes_hdp_before_ib) {
+         res->domains = RADEON_DOMAIN_GTT;
+         res->flags |= RADEON_FLAG_GTT_WC;
+         break;
+      }
+      /* fall through */
+   case PIPE_USAGE_DEFAULT:
+   case PIPE_USAGE_IMMUTABLE:
+   default:
+      /* Not listing GTT here improves performance in some
+       * apps. */
+      res->domains = RADEON_DOMAIN_VRAM;
+      res->flags |= RADEON_FLAG_GTT_WC;
+      break;
+   }
+
+   if (res->b.b.target == PIPE_BUFFER && res->b.b.flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) {
+      /* Use GTT for all persistent mappings with older
+       * kernels, because they didn't always flush the HDP
+       * cache before CS execution.
+       *
+       * Write-combined CPU mappings are fine, the kernel
+       * ensures all CPU writes finish before the GPU
+       * executes a command stream.
+       *
+       * radeon doesn't have good BO move throttling, so put all
+       * persistent buffers into GTT to prevent VRAM CPU page faults.
+       */
+      if (!sscreen->info.kernel_flushes_hdp_before_ib || !sscreen->info.is_amdgpu)
+         res->domains = RADEON_DOMAIN_GTT;
+   }
+
+   /* Tiled textures are unmappable. Always put them in VRAM. */
+   if ((res->b.b.target != PIPE_BUFFER && !tex->surface.is_linear) ||
+       res->b.b.flags & SI_RESOURCE_FLAG_UNMAPPABLE) {
+      res->domains = RADEON_DOMAIN_VRAM;
+      res->flags |= RADEON_FLAG_NO_CPU_ACCESS | RADEON_FLAG_GTT_WC;
+   }
+
+   /* Displayable and shareable surfaces are not suballocated. */
+   if (res->b.b.bind & (PIPE_BIND_SHARED | PIPE_BIND_SCANOUT))
+      res->flags |= RADEON_FLAG_NO_SUBALLOC; /* shareable */
+   else
+      res->flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
+
+   if (sscreen->debug_flags & DBG(NO_WC))
+      res->flags &= ~RADEON_FLAG_GTT_WC;
+
+   if (res->b.b.flags & SI_RESOURCE_FLAG_READ_ONLY)
+      res->flags |= RADEON_FLAG_READ_ONLY;
+
+   if (res->b.b.flags & SI_RESOURCE_FLAG_32BIT)
+      res->flags |= RADEON_FLAG_32BIT;
+
+   /* Set expected VRAM and GART usage for the buffer. */
+   res->vram_usage = 0;
+   res->gart_usage = 0;
+   res->max_forced_staging_uploads = 0;
+   res->b.max_forced_staging_uploads = 0;
+
+   if (res->domains & RADEON_DOMAIN_VRAM) {
+      res->vram_usage = size;
+
+      res->max_forced_staging_uploads = res->b.max_forced_staging_uploads =
+         sscreen->info.has_dedicated_vram && size >= sscreen->info.vram_vis_size / 4 ? 1 : 0;
+   } else if (res->domains & RADEON_DOMAIN_GTT) {
+      res->gart_usage = size;
+   }
 }
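
As a rough illustration (not part of this change): the switch above keys buffer placement off the usage hint in pipe_resource, so callers pick the memory domain indirectly at creation time. The sketch below assumes the common gallium helper pipe_buffer_create() from util/u_inlines.h; the bind flag and size are arbitrary examples.

#include "util/u_inlines.h"

/* Illustration only: a PIPE_USAGE_STREAM buffer takes the GTT + write-combined
 * path in si_init_resource_fields() above, while PIPE_USAGE_DEFAULT would land
 * in VRAM. The usage hint is what selects the domain; bind flags only affect
 * things like suballocation. */
static struct pipe_resource *create_stream_vbo(struct pipe_screen *screen, unsigned size)
{
   return pipe_buffer_create(screen, PIPE_BIND_VERTEX_BUFFER, PIPE_USAGE_STREAM, size);
}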
 
-bool si_alloc_resource(struct si_screen *sscreen,
-                      struct si_resource *res)
+bool si_alloc_resource(struct si_screen *sscreen, struct si_resource *res)
 {
-       struct pb_buffer *old_buf, *new_buf;
-
-       /* Allocate a new resource. */
-       new_buf = sscreen->ws->buffer_create(sscreen->ws, res->bo_size,
-                                            res->bo_alignment,
-                                            res->domains, res->flags);
-       if (!new_buf) {
-               return false;
-       }
-
-       /* Replace the pointer such that if res->buf wasn't NULL, it won't be
-        * NULL. This should prevent crashes with multiple contexts using
-        * the same buffer where one of the contexts invalidates it while
-        * the others are using it. */
-       old_buf = res->buf;
-       res->buf = new_buf; /* should be atomic */
-       res->gpu_address = sscreen->ws->buffer_get_virtual_address(res->buf);
-
-       if (res->flags & RADEON_FLAG_32BIT) {
-               uint64_t start = res->gpu_address;
-               uint64_t last = start + res->bo_size - 1;
-               (void)start;
-               (void)last;
-
-               assert((start >> 32) == sscreen->info.address32_hi);
-               assert((last >> 32) == sscreen->info.address32_hi);
-       }
-
-       pb_reference(&old_buf, NULL);
-
-       util_range_set_empty(&res->valid_buffer_range);
-       res->TC_L2_dirty = false;
-
-       /* Print debug information. */
-       if (sscreen->debug_flags & DBG(VM) && res->b.b.target == PIPE_BUFFER) {
-               fprintf(stderr, "VM start=0x%"PRIX64"  end=0x%"PRIX64" | Buffer %"PRIu64" bytes\n",
-                       res->gpu_address, res->gpu_address + res->buf->size,
-                       res->buf->size);
-       }
-
-       if (res->b.b.flags & SI_RESOURCE_FLAG_CLEAR)
-               si_screen_clear_buffer(sscreen, &res->b.b, 0, res->bo_size, 0);
-
-       return true;
+   struct pb_buffer *old_buf, *new_buf;
+
+   /* Allocate a new resource. */
+   new_buf = sscreen->ws->buffer_create(sscreen->ws, res->bo_size, res->bo_alignment, res->domains,
+                                        res->flags);
+   if (!new_buf) {
+      return false;
+   }
+
+   /* Replace the pointer such that if res->buf wasn't NULL, it won't be
+    * NULL. This should prevent crashes with multiple contexts using
+    * the same buffer where one of the contexts invalidates it while
+    * the others are using it. */
+   old_buf = res->buf;
+   res->buf = new_buf; /* should be atomic */
+   res->gpu_address = sscreen->ws->buffer_get_virtual_address(res->buf);
+
+   if (res->flags & RADEON_FLAG_32BIT) {
+      uint64_t start = res->gpu_address;
+      uint64_t last = start + res->bo_size - 1;
+      (void)start;
+      (void)last;
+
+      assert((start >> 32) == sscreen->info.address32_hi);
+      assert((last >> 32) == sscreen->info.address32_hi);
+   }
+
+   pb_reference(&old_buf, NULL);
+
+   util_range_set_empty(&res->valid_buffer_range);
+   res->TC_L2_dirty = false;
+
+   /* Print debug information. */
+   if (sscreen->debug_flags & DBG(VM) && res->b.b.target == PIPE_BUFFER) {
+      fprintf(stderr, "VM start=0x%" PRIX64 "  end=0x%" PRIX64 " | Buffer %" PRIu64 " bytes\n",
+              res->gpu_address, res->gpu_address + res->buf->size, res->buf->size);
+   }
+
+   if (res->b.b.flags & SI_RESOURCE_FLAG_CLEAR)
+      si_screen_clear_buffer(sscreen, &res->b.b, 0, res->bo_size, 0);
+
+   return true;
 }
 
-static void si_buffer_destroy(struct pipe_screen *screen,
-                             struct pipe_resource *buf)
+static void si_buffer_destroy(struct pipe_screen *screen, struct pipe_resource *buf)
 {
-       struct si_resource *buffer = si_resource(buf);
+   struct si_resource *buffer = si_resource(buf);
 
-       threaded_resource_deinit(buf);
-       util_range_destroy(&buffer->valid_buffer_range);
-       pb_reference(&buffer->buf, NULL);
-       FREE(buffer);
+   threaded_resource_deinit(buf);
+   util_range_destroy(&buffer->valid_buffer_range);
+   pb_reference(&buffer->buf, NULL);
+   FREE(buffer);
 }
 
 /* Reallocate the buffer and update all resource bindings where the buffer is
@@ -266,560 +253,511 @@ static void si_buffer_destroy(struct pipe_screen *screen,
  * This is used to avoid CPU-GPU synchronizations, because it makes the buffer
  * idle by discarding its contents.
  */
-static bool
-si_invalidate_buffer(struct si_context *sctx,
-                    struct si_resource *buf)
+static bool si_invalidate_buffer(struct si_context *sctx, struct si_resource *buf)
 {
-       /* Shared buffers can't be reallocated. */
-       if (buf->b.is_shared)
-               return false;
-
-       /* Sparse buffers can't be reallocated. */
-       if (buf->flags & RADEON_FLAG_SPARSE)
-               return false;
-
-       /* In AMD_pinned_memory, the user pointer association only gets
-        * broken when the buffer is explicitly re-allocated.
-        */
-       if (buf->b.is_user_ptr)
-               return false;
-
-       /* Check if mapping this buffer would cause waiting for the GPU. */
-       if (si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) ||
-           !sctx->ws->buffer_wait(buf->buf, 0, RADEON_USAGE_READWRITE)) {
-               /* Reallocate the buffer in the same pipe_resource. */
-               si_alloc_resource(sctx->screen, buf);
-               si_rebind_buffer(sctx, &buf->b.b);
-       } else {
-               util_range_set_empty(&buf->valid_buffer_range);
-       }
-
-       return true;
+   /* Shared buffers can't be reallocated. */
+   if (buf->b.is_shared)
+      return false;
+
+   /* Sparse buffers can't be reallocated. */
+   if (buf->flags & RADEON_FLAG_SPARSE)
+      return false;
+
+   /* In AMD_pinned_memory, the user pointer association only gets
+    * broken when the buffer is explicitly re-allocated.
+    */
+   if (buf->b.is_user_ptr)
+      return false;
+
+   /* Check if mapping this buffer would cause waiting for the GPU. */
+   if (si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) ||
+       !sctx->ws->buffer_wait(buf->buf, 0, RADEON_USAGE_READWRITE)) {
+      /* Reallocate the buffer in the same pipe_resource. */
+      si_alloc_resource(sctx->screen, buf);
+      si_rebind_buffer(sctx, &buf->b.b);
+   } else {
+      util_range_set_empty(&buf->valid_buffer_range);
+   }
+
+   return true;
 }
 
 /* Replace the storage of dst with src. */
-void si_replace_buffer_storage(struct pipe_context *ctx,
-                                struct pipe_resource *dst,
-                                struct pipe_resource *src)
+void si_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *dst,
+                               struct pipe_resource *src)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
-       struct si_resource *sdst = si_resource(dst);
-       struct si_resource *ssrc = si_resource(src);
-
-       pb_reference(&sdst->buf, ssrc->buf);
-       sdst->gpu_address = ssrc->gpu_address;
-       sdst->b.b.bind = ssrc->b.b.bind;
-       sdst->b.max_forced_staging_uploads = ssrc->b.max_forced_staging_uploads;
-       sdst->max_forced_staging_uploads = ssrc->max_forced_staging_uploads;
-       sdst->flags = ssrc->flags;
-
-       assert(sdst->vram_usage == ssrc->vram_usage);
-       assert(sdst->gart_usage == ssrc->gart_usage);
-       assert(sdst->bo_size == ssrc->bo_size);
-       assert(sdst->bo_alignment == ssrc->bo_alignment);
-       assert(sdst->domains == ssrc->domains);
-
-       si_rebind_buffer(sctx, dst);
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_resource *sdst = si_resource(dst);
+   struct si_resource *ssrc = si_resource(src);
+
+   pb_reference(&sdst->buf, ssrc->buf);
+   sdst->gpu_address = ssrc->gpu_address;
+   sdst->b.b.bind = ssrc->b.b.bind;
+   sdst->b.max_forced_staging_uploads = ssrc->b.max_forced_staging_uploads;
+   sdst->max_forced_staging_uploads = ssrc->max_forced_staging_uploads;
+   sdst->flags = ssrc->flags;
+
+   assert(sdst->vram_usage == ssrc->vram_usage);
+   assert(sdst->gart_usage == ssrc->gart_usage);
+   assert(sdst->bo_size == ssrc->bo_size);
+   assert(sdst->bo_alignment == ssrc->bo_alignment);
+   assert(sdst->domains == ssrc->domains);
+
+   si_rebind_buffer(sctx, dst);
 }
 
-static void si_invalidate_resource(struct pipe_context *ctx,
-                                  struct pipe_resource *resource)
+static void si_invalidate_resource(struct pipe_context *ctx, struct pipe_resource *resource)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
-       struct si_resource *buf = si_resource(resource);
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_resource *buf = si_resource(resource);
 
-       /* We currently only do anything here for buffers */
-       if (resource->target == PIPE_BUFFER)
-               (void)si_invalidate_buffer(sctx, buf);
+   /* We currently only do anything here for buffers */
+   if (resource->target == PIPE_BUFFER)
+      (void)si_invalidate_buffer(sctx, buf);
 }
 
-static void *si_buffer_get_transfer(struct pipe_context *ctx,
-                                   struct pipe_resource *resource,
-                                   unsigned usage,
-                                   const struct pipe_box *box,
-                                   struct pipe_transfer **ptransfer,
-                                   void *data, struct si_resource *staging,
-                                   unsigned offset)
+static void *si_buffer_get_transfer(struct pipe_context *ctx, struct pipe_resource *resource,
+                                    unsigned usage, const struct pipe_box *box,
+                                    struct pipe_transfer **ptransfer, void *data,
+                                    struct si_resource *staging, unsigned offset)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
-       struct si_transfer *transfer;
-
-       if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC)
-               transfer = slab_alloc(&sctx->pool_transfers_unsync);
-       else
-               transfer = slab_alloc(&sctx->pool_transfers);
-
-       transfer->b.b.resource = NULL;
-       pipe_resource_reference(&transfer->b.b.resource, resource);
-       transfer->b.b.level = 0;
-       transfer->b.b.usage = usage;
-       transfer->b.b.box = *box;
-       transfer->b.b.stride = 0;
-       transfer->b.b.layer_stride = 0;
-       transfer->b.staging = NULL;
-       transfer->offset = offset;
-       transfer->staging = staging;
-       *ptransfer = &transfer->b.b;
-       return data;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_transfer *transfer;
+
+   if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC)
+      transfer = slab_alloc(&sctx->pool_transfers_unsync);
+   else
+      transfer = slab_alloc(&sctx->pool_transfers);
+
+   transfer->b.b.resource = NULL;
+   pipe_resource_reference(&transfer->b.b.resource, resource);
+   transfer->b.b.level = 0;
+   transfer->b.b.usage = usage;
+   transfer->b.b.box = *box;
+   transfer->b.b.stride = 0;
+   transfer->b.b.layer_stride = 0;
+   transfer->b.staging = NULL;
+   transfer->offset = offset;
+   transfer->staging = staging;
+   *ptransfer = &transfer->b.b;
+   return data;
 }
 
-static void *si_buffer_transfer_map(struct pipe_context *ctx,
-                                   struct pipe_resource *resource,
-                                   unsigned level,
-                                   unsigned usage,
-                                   const struct pipe_box *box,
-                                   struct pipe_transfer **ptransfer)
+static void *si_buffer_transfer_map(struct pipe_context *ctx, struct pipe_resource *resource,
+                                    unsigned level, unsigned usage, const struct pipe_box *box,
+                                    struct pipe_transfer **ptransfer)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
-       struct si_resource *buf = si_resource(resource);
-       uint8_t *data;
-
-       assert(box->x + box->width <= resource->width0);
-
-       /* From GL_AMD_pinned_memory issues:
-        *
-        *     4) Is glMapBuffer on a shared buffer guaranteed to return the
-        *        same system address which was specified at creation time?
-        *
-        *        RESOLVED: NO. The GL implementation might return a different
-        *        virtual mapping of that memory, although the same physical
-        *        page will be used.
-        *
-        * So don't ever use staging buffers.
-        */
-       if (buf->b.is_user_ptr)
-               usage |= PIPE_TRANSFER_PERSISTENT;
-
-       /* See if the buffer range being mapped has never been initialized,
-        * in which case it can be mapped unsynchronized. */
-       if (!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
-                      TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED)) &&
-           usage & PIPE_TRANSFER_WRITE &&
-           !buf->b.is_shared &&
-           !util_ranges_intersect(&buf->valid_buffer_range, box->x, box->x + box->width)) {
-               usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
-       }
-
-       /* If discarding the entire range, discard the whole resource instead. */
-       if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
-           box->x == 0 && box->width == resource->width0) {
-               usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
-       }
-
-       /* If a buffer in VRAM is too large and the range is discarded, don't
-        * map it directly. This makes sure that the buffer stays in VRAM.
-        */
-       bool force_discard_range = false;
-       if (usage & (PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE |
-                    PIPE_TRANSFER_DISCARD_RANGE) &&
-           !(usage & PIPE_TRANSFER_PERSISTENT) &&
-           /* Try not to decrement the counter if it's not positive. Still racy,
-            * but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */
-           buf->max_forced_staging_uploads > 0 &&
-           p_atomic_dec_return(&buf->max_forced_staging_uploads) >= 0) {
-               usage &= ~(PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE |
-                          PIPE_TRANSFER_UNSYNCHRONIZED);
-               usage |= PIPE_TRANSFER_DISCARD_RANGE;
-               force_discard_range = true;
-       }
-
-       if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE &&
-           !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
-                      TC_TRANSFER_MAP_NO_INVALIDATE))) {
-               assert(usage & PIPE_TRANSFER_WRITE);
-
-               if (si_invalidate_buffer(sctx, buf)) {
-                       /* At this point, the buffer is always idle. */
-                       usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
-               } else {
-                       /* Fall back to a temporary buffer. */
-                       usage |= PIPE_TRANSFER_DISCARD_RANGE;
-               }
-       }
-
-       if (usage & PIPE_TRANSFER_FLUSH_EXPLICIT &&
-           buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
-               usage &= ~(PIPE_TRANSFER_UNSYNCHRONIZED |
-                          PIPE_TRANSFER_PERSISTENT);
-               usage |= PIPE_TRANSFER_DISCARD_RANGE;
-               force_discard_range = true;
-       }
-
-       if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
-           ((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
-                        PIPE_TRANSFER_PERSISTENT))) ||
-            (buf->flags & RADEON_FLAG_SPARSE))) {
-               assert(usage & PIPE_TRANSFER_WRITE);
-
-               /* Check if mapping this buffer would cause waiting for the GPU.
-                */
-               if (buf->flags & RADEON_FLAG_SPARSE ||
-                   force_discard_range ||
-                   si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) ||
-                   !sctx->ws->buffer_wait(buf->buf, 0, RADEON_USAGE_READWRITE)) {
-                       /* Do a wait-free write-only transfer using a temporary buffer. */
-                       struct u_upload_mgr *uploader;
-                       struct si_resource *staging = NULL;
-                       unsigned offset;
-
-                       /* If we are not called from the driver thread, we have
-                        * to use the uploader from u_threaded_context, which is
-                        * local to the calling thread.
-                        */
-                       if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC)
-                               uploader = sctx->tc->base.stream_uploader;
-                       else
-                               uploader = sctx->b.stream_uploader;
-
-                       u_upload_alloc(uploader, 0,
-                                       box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT),
-                                      sctx->screen->info.tcc_cache_line_size,
-                                      &offset, (struct pipe_resource**)&staging,
-                                       (void**)&data);
-
-                       if (staging) {
-                               data += box->x % SI_MAP_BUFFER_ALIGNMENT;
-                               return si_buffer_get_transfer(ctx, resource, usage, box,
-                                                               ptransfer, data, staging, offset);
-                       } else if (buf->flags & RADEON_FLAG_SPARSE) {
-                               return NULL;
-                       }
-               } else {
-                       /* At this point, the buffer is always idle (we checked it above). */
-                       usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
-               }
-       }
-       /* Use a staging buffer in cached GTT for reads. */
-       else if (((usage & PIPE_TRANSFER_READ) &&
-                 !(usage & PIPE_TRANSFER_PERSISTENT) &&
-                 (buf->domains & RADEON_DOMAIN_VRAM ||
-                  buf->flags & RADEON_FLAG_GTT_WC)) ||
-                (buf->flags & RADEON_FLAG_SPARSE)) {
-               struct si_resource *staging;
-
-               assert(!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC));
-               staging = si_resource(pipe_buffer_create(
-                               ctx->screen, 0, PIPE_USAGE_STAGING,
-                               box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT)));
-               if (staging) {
-                       /* Copy the VRAM buffer to the staging buffer. */
-                       si_sdma_copy_buffer(sctx, &staging->b.b, resource,
-                                           box->x % SI_MAP_BUFFER_ALIGNMENT,
-                                           box->x, box->width);
-
-                       data = si_buffer_map_sync_with_rings(sctx, staging,
-                                                            usage & ~PIPE_TRANSFER_UNSYNCHRONIZED);
-                       if (!data) {
-                               si_resource_reference(&staging, NULL);
-                               return NULL;
-                       }
-                       data += box->x % SI_MAP_BUFFER_ALIGNMENT;
-
-                       return si_buffer_get_transfer(ctx, resource, usage, box,
-                                                       ptransfer, data, staging, 0);
-               } else if (buf->flags & RADEON_FLAG_SPARSE) {
-                       return NULL;
-               }
-       }
-
-       data = si_buffer_map_sync_with_rings(sctx, buf, usage);
-       if (!data) {
-               return NULL;
-       }
-       data += box->x;
-
-       return si_buffer_get_transfer(ctx, resource, usage, box,
-                                       ptransfer, data, NULL, 0);
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_resource *buf = si_resource(resource);
+   uint8_t *data;
+
+   assert(box->x + box->width <= resource->width0);
+
+   /* From GL_AMD_pinned_memory issues:
+    *
+    *     4) Is glMapBuffer on a shared buffer guaranteed to return the
+    *        same system address which was specified at creation time?
+    *
+    *        RESOLVED: NO. The GL implementation might return a different
+    *        virtual mapping of that memory, although the same physical
+    *        page will be used.
+    *
+    * So don't ever use staging buffers.
+    */
+   if (buf->b.is_user_ptr)
+      usage |= PIPE_TRANSFER_PERSISTENT;
+
+   /* See if the buffer range being mapped has never been initialized,
+    * in which case it can be mapped unsynchronized. */
+   if (!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED)) &&
+       usage & PIPE_TRANSFER_WRITE && !buf->b.is_shared &&
+       !util_ranges_intersect(&buf->valid_buffer_range, box->x, box->x + box->width)) {
+      usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+   }
+
+   /* If discarding the entire range, discard the whole resource instead. */
+   if (usage & PIPE_TRANSFER_DISCARD_RANGE && box->x == 0 && box->width == resource->width0) {
+      usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
+   }
+
+   /* If a buffer in VRAM is too large and the range is discarded, don't
+    * map it directly. This makes sure that the buffer stays in VRAM.
+    */
+   bool force_discard_range = false;
+   if (usage & (PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE | PIPE_TRANSFER_DISCARD_RANGE) &&
+       !(usage & PIPE_TRANSFER_PERSISTENT) &&
+       /* Try not to decrement the counter if it's not positive. Still racy,
+        * but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */
+       buf->max_forced_staging_uploads > 0 &&
+       p_atomic_dec_return(&buf->max_forced_staging_uploads) >= 0) {
+      usage &= ~(PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE | PIPE_TRANSFER_UNSYNCHRONIZED);
+      usage |= PIPE_TRANSFER_DISCARD_RANGE;
+      force_discard_range = true;
+   }
+
+   if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE &&
+       !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | TC_TRANSFER_MAP_NO_INVALIDATE))) {
+      assert(usage & PIPE_TRANSFER_WRITE);
+
+      if (si_invalidate_buffer(sctx, buf)) {
+         /* At this point, the buffer is always idle. */
+         usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+      } else {
+         /* Fall back to a temporary buffer. */
+         usage |= PIPE_TRANSFER_DISCARD_RANGE;
+      }
+   }
+
+   if (usage & PIPE_TRANSFER_FLUSH_EXPLICIT &&
+       buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
+      usage &= ~(PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_PERSISTENT);
+      usage |= PIPE_TRANSFER_DISCARD_RANGE;
+      force_discard_range = true;
+   }
+
+   if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
+       ((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_PERSISTENT))) ||
+        (buf->flags & RADEON_FLAG_SPARSE))) {
+      assert(usage & PIPE_TRANSFER_WRITE);
+
+      /* Check if mapping this buffer would cause waiting for the GPU.
+       */
+      if (buf->flags & RADEON_FLAG_SPARSE || force_discard_range ||
+          si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) ||
+          !sctx->ws->buffer_wait(buf->buf, 0, RADEON_USAGE_READWRITE)) {
+         /* Do a wait-free write-only transfer using a temporary buffer. */
+         struct u_upload_mgr *uploader;
+         struct si_resource *staging = NULL;
+         unsigned offset;
+
+         /* If we are not called from the driver thread, we have
+          * to use the uploader from u_threaded_context, which is
+          * local to the calling thread.
+          */
+         if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC)
+            uploader = sctx->tc->base.stream_uploader;
+         else
+            uploader = sctx->b.stream_uploader;
+
+         u_upload_alloc(uploader, 0, box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT),
+                        sctx->screen->info.tcc_cache_line_size, &offset,
+                        (struct pipe_resource **)&staging, (void **)&data);
+
+         if (staging) {
+            data += box->x % SI_MAP_BUFFER_ALIGNMENT;
+            return si_buffer_get_transfer(ctx, resource, usage, box, ptransfer, data, staging,
+                                          offset);
+         } else if (buf->flags & RADEON_FLAG_SPARSE) {
+            return NULL;
+         }
+      } else {
+         /* At this point, the buffer is always idle (we checked it above). */
+         usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+      }
+   }
+   /* Use a staging buffer in cached GTT for reads. */
+   else if (((usage & PIPE_TRANSFER_READ) && !(usage & PIPE_TRANSFER_PERSISTENT) &&
+             (buf->domains & RADEON_DOMAIN_VRAM || buf->flags & RADEON_FLAG_GTT_WC)) ||
+            (buf->flags & RADEON_FLAG_SPARSE)) {
+      struct si_resource *staging;
+
+      assert(!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC));
+      staging = si_resource(pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_STAGING,
+                                               box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT)));
+      if (staging) {
+         /* Copy the VRAM buffer to the staging buffer. */
+         si_sdma_copy_buffer(sctx, &staging->b.b, resource, box->x % SI_MAP_BUFFER_ALIGNMENT,
+                             box->x, box->width);
+
+         data = si_buffer_map_sync_with_rings(sctx, staging, usage & ~PIPE_TRANSFER_UNSYNCHRONIZED);
+         if (!data) {
+            si_resource_reference(&staging, NULL);
+            return NULL;
+         }
+         data += box->x % SI_MAP_BUFFER_ALIGNMENT;
+
+         return si_buffer_get_transfer(ctx, resource, usage, box, ptransfer, data, staging, 0);
+      } else if (buf->flags & RADEON_FLAG_SPARSE) {
+         return NULL;
+      }
+   }
+
+   data = si_buffer_map_sync_with_rings(sctx, buf, usage);
+   if (!data) {
+      return NULL;
+   }
+   data += box->x;
+
+   return si_buffer_get_transfer(ctx, resource, usage, box, ptransfer, data, NULL, 0);
 }
 
-static void si_buffer_do_flush_region(struct pipe_context *ctx,
-                                     struct pipe_transfer *transfer,
-                                     const struct pipe_box *box)
+static void si_buffer_do_flush_region(struct pipe_context *ctx, struct pipe_transfer *transfer,
+                                      const struct pipe_box *box)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
-       struct si_transfer *stransfer = (struct si_transfer*)transfer;
-       struct si_resource *buf = si_resource(transfer->resource);
-
-       if (stransfer->staging) {
-               unsigned src_offset = stransfer->offset +
-                                     transfer->box.x % SI_MAP_BUFFER_ALIGNMENT +
-                                     (box->x - transfer->box.x);
-
-               if (buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
-                       /* This should be true for all uploaders. */
-                       assert(transfer->box.x == 0);
-
-                       /* Find a previous upload and extend its range. The last
-                        * upload is likely to be at the end of the list.
-                        */
-                       for (int i = sctx->num_sdma_uploads - 1; i >= 0; i--) {
-                               struct si_sdma_upload *up = &sctx->sdma_uploads[i];
-
-                               if (up->dst != buf)
-                                       continue;
-
-                               assert(up->src == stransfer->staging);
-                               assert(box->x > up->dst_offset);
-                               up->size = box->x + box->width - up->dst_offset;
-                               return;
-                       }
-
-                       /* Enlarge the array if it's full. */
-                       if (sctx->num_sdma_uploads == sctx->max_sdma_uploads) {
-                               unsigned size;
-
-                               sctx->max_sdma_uploads += 4;
-                               size = sctx->max_sdma_uploads * sizeof(sctx->sdma_uploads[0]);
-                               sctx->sdma_uploads = realloc(sctx->sdma_uploads, size);
-                       }
-
-                       /* Add a new upload. */
-                       struct si_sdma_upload *up =
-                               &sctx->sdma_uploads[sctx->num_sdma_uploads++];
-                       up->dst = up->src = NULL;
-                       si_resource_reference(&up->dst, buf);
-                       si_resource_reference(&up->src, stransfer->staging);
-                       up->dst_offset = box->x;
-                       up->src_offset = src_offset;
-                       up->size = box->width;
-                       return;
-               }
-
-               /* Copy the staging buffer into the original one. */
-               si_copy_buffer(sctx, transfer->resource, &stransfer->staging->b.b,
-                              box->x, src_offset, box->width);
-       }
-
-       util_range_add(&buf->b.b, &buf->valid_buffer_range, box->x,
-                      box->x + box->width);
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_transfer *stransfer = (struct si_transfer *)transfer;
+   struct si_resource *buf = si_resource(transfer->resource);
+
+   if (stransfer->staging) {
+      unsigned src_offset =
+         stransfer->offset + transfer->box.x % SI_MAP_BUFFER_ALIGNMENT + (box->x - transfer->box.x);
+
+      if (buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
+         /* This should be true for all uploaders. */
+         assert(transfer->box.x == 0);
+
+         /* Find a previous upload and extend its range. The last
+          * upload is likely to be at the end of the list.
+          */
+         for (int i = sctx->num_sdma_uploads - 1; i >= 0; i--) {
+            struct si_sdma_upload *up = &sctx->sdma_uploads[i];
+
+            if (up->dst != buf)
+               continue;
+
+            assert(up->src == stransfer->staging);
+            assert(box->x > up->dst_offset);
+            up->size = box->x + box->width - up->dst_offset;
+            return;
+         }
+
+         /* Enlarge the array if it's full. */
+         if (sctx->num_sdma_uploads == sctx->max_sdma_uploads) {
+            unsigned size;
+
+            sctx->max_sdma_uploads += 4;
+            size = sctx->max_sdma_uploads * sizeof(sctx->sdma_uploads[0]);
+            sctx->sdma_uploads = realloc(sctx->sdma_uploads, size);
+         }
+
+         /* Add a new upload. */
+         struct si_sdma_upload *up = &sctx->sdma_uploads[sctx->num_sdma_uploads++];
+         up->dst = up->src = NULL;
+         si_resource_reference(&up->dst, buf);
+         si_resource_reference(&up->src, stransfer->staging);
+         up->dst_offset = box->x;
+         up->src_offset = src_offset;
+         up->size = box->width;
+         return;
+      }
+
+      /* Copy the staging buffer into the original one. */
+      si_copy_buffer(sctx, transfer->resource, &stransfer->staging->b.b, box->x, src_offset,
+                     box->width);
+   }
+
+   util_range_add(&buf->b.b, &buf->valid_buffer_range, box->x, box->x + box->width);
 }
 
-static void si_buffer_flush_region(struct pipe_context *ctx,
-                                  struct pipe_transfer *transfer,
-                                  const struct pipe_box *rel_box)
+static void si_buffer_flush_region(struct pipe_context *ctx, struct pipe_transfer *transfer,
+                                   const struct pipe_box *rel_box)
 {
-       unsigned required_usage = PIPE_TRANSFER_WRITE |
-                                 PIPE_TRANSFER_FLUSH_EXPLICIT;
+   unsigned required_usage = PIPE_TRANSFER_WRITE | PIPE_TRANSFER_FLUSH_EXPLICIT;
 
-       if ((transfer->usage & required_usage) == required_usage) {
-               struct pipe_box box;
+   if ((transfer->usage & required_usage) == required_usage) {
+      struct pipe_box box;
 
-               u_box_1d(transfer->box.x + rel_box->x, rel_box->width, &box);
-               si_buffer_do_flush_region(ctx, transfer, &box);
-       }
+      u_box_1d(transfer->box.x + rel_box->x, rel_box->width, &box);
+      si_buffer_do_flush_region(ctx, transfer, &box);
+   }
 }
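
The pair above implements the explicit-flush write path: a caller maps with PIPE_TRANSFER_WRITE | PIPE_TRANSFER_FLUSH_EXPLICIT, writes one or more subranges, and flushes each one so that only those boxes reach si_buffer_do_flush_region(). A minimal sketch of such a caller, assuming the standard helpers from util/u_inlines.h (pipe_buffer_map_range, pipe_buffer_flush_mapped_range, pipe_buffer_unmap); the offset and size values are placeholders.

#include <string.h>

#include "util/u_inlines.h"

/* Illustration only: write [offset, offset + size) and flush just that range. */
static void write_subrange(struct pipe_context *pipe, struct pipe_resource *buf, unsigned offset,
                           unsigned size, const void *data)
{
   struct pipe_transfer *xfer;
   void *map = pipe_buffer_map_range(pipe, buf, offset, size,
                                     PIPE_TRANSFER_WRITE | PIPE_TRANSFER_FLUSH_EXPLICIT, &xfer);
   if (!map)
      return;

   memcpy(map, data, size);
   /* The offset here is relative to the start of the buffer; the helper turns
    * it into a box relative to the mapped range before calling
    * pipe->transfer_flush_region (si_buffer_flush_region above). */
   pipe_buffer_flush_mapped_range(pipe, xfer, offset, size);
   pipe_buffer_unmap(pipe, xfer);
}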
 
-static void si_buffer_transfer_unmap(struct pipe_context *ctx,
-                                    struct pipe_transfer *transfer)
+static void si_buffer_transfer_unmap(struct pipe_context *ctx, struct pipe_transfer *transfer)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
-       struct si_transfer *stransfer = (struct si_transfer*)transfer;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_transfer *stransfer = (struct si_transfer *)transfer;
 
-       if (transfer->usage & PIPE_TRANSFER_WRITE &&
-           !(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT))
-               si_buffer_do_flush_region(ctx, transfer, &transfer->box);
+   if (transfer->usage & PIPE_TRANSFER_WRITE && !(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT))
+      si_buffer_do_flush_region(ctx, transfer, &transfer->box);
 
-       si_resource_reference(&stransfer->staging, NULL);
-       assert(stransfer->b.staging == NULL); /* for threaded context only */
-       pipe_resource_reference(&transfer->resource, NULL);
+   si_resource_reference(&stransfer->staging, NULL);
+   assert(stransfer->b.staging == NULL); /* for threaded context only */
+   pipe_resource_reference(&transfer->resource, NULL);
 
-       /* Don't use pool_transfers_unsync. We are always in the driver
-        * thread. */
-       slab_free(&sctx->pool_transfers, transfer);
+   /* Don't use pool_transfers_unsync. We are always in the driver
+    * thread. */
+   slab_free(&sctx->pool_transfers, transfer);
 }
 
-static void si_buffer_subdata(struct pipe_context *ctx,
-                             struct pipe_resource *buffer,
-                             unsigned usage, unsigned offset,
-                             unsigned size, const void *data)
+static void si_buffer_subdata(struct pipe_context *ctx, struct pipe_resource *buffer,
+                              unsigned usage, unsigned offset, unsigned size, const void *data)
 {
-       struct pipe_transfer *transfer = NULL;
-       struct pipe_box box;
-       uint8_t *map = NULL;
+   struct pipe_transfer *transfer = NULL;
+   struct pipe_box box;
+   uint8_t *map = NULL;
 
-       usage |= PIPE_TRANSFER_WRITE;
+   usage |= PIPE_TRANSFER_WRITE;
 
-       if (!(usage & PIPE_TRANSFER_MAP_DIRECTLY))
-               usage |= PIPE_TRANSFER_DISCARD_RANGE;
+   if (!(usage & PIPE_TRANSFER_MAP_DIRECTLY))
+      usage |= PIPE_TRANSFER_DISCARD_RANGE;
 
-       u_box_1d(offset, size, &box);
-       map = si_buffer_transfer_map(ctx, buffer, 0, usage, &box, &transfer);
-       if (!map)
-               return;
+   u_box_1d(offset, size, &box);
+   map = si_buffer_transfer_map(ctx, buffer, 0, usage, &box, &transfer);
+   if (!map)
+      return;
 
-       memcpy(map, data, size);
-       si_buffer_transfer_unmap(ctx, transfer);
+   memcpy(map, data, size);
+   si_buffer_transfer_unmap(ctx, transfer);
 }
 
-static const struct u_resource_vtbl si_buffer_vtbl =
-{
-       NULL,                           /* get_handle */
-       si_buffer_destroy,              /* resource_destroy */
-       si_buffer_transfer_map, /* transfer_map */
-       si_buffer_flush_region, /* transfer_flush_region */
-       si_buffer_transfer_unmap,       /* transfer_unmap */
+static const struct u_resource_vtbl si_buffer_vtbl = {
+   NULL,                     /* get_handle */
+   si_buffer_destroy,        /* resource_destroy */
+   si_buffer_transfer_map,   /* transfer_map */
+   si_buffer_flush_region,   /* transfer_flush_region */
+   si_buffer_transfer_unmap, /* transfer_unmap */
 };
 
-static struct si_resource *
-si_alloc_buffer_struct(struct pipe_screen *screen,
-                      const struct pipe_resource *templ)
+static struct si_resource *si_alloc_buffer_struct(struct pipe_screen *screen,
+                                                  const struct pipe_resource *templ)
 {
-       struct si_resource *buf;
+   struct si_resource *buf;
 
-       buf = MALLOC_STRUCT(si_resource);
+   buf = MALLOC_STRUCT(si_resource);
 
-       buf->b.b = *templ;
-       buf->b.b.next = NULL;
-       pipe_reference_init(&buf->b.b.reference, 1);
-       buf->b.b.screen = screen;
+   buf->b.b = *templ;
+   buf->b.b.next = NULL;
+   pipe_reference_init(&buf->b.b.reference, 1);
+   buf->b.b.screen = screen;
 
-       buf->b.vtbl = &si_buffer_vtbl;
-       threaded_resource_init(&buf->b.b);
+   buf->b.vtbl = &si_buffer_vtbl;
+   threaded_resource_init(&buf->b.b);
 
-       buf->buf = NULL;
-       buf->bind_history = 0;
-       buf->TC_L2_dirty = false;
-       util_range_init(&buf->valid_buffer_range);
-       return buf;
+   buf->buf = NULL;
+   buf->bind_history = 0;
+   buf->TC_L2_dirty = false;
+   util_range_init(&buf->valid_buffer_range);
+   return buf;
 }
 
 static struct pipe_resource *si_buffer_create(struct pipe_screen *screen,
-                                             const struct pipe_resource *templ,
-                                             unsigned alignment)
+                                              const struct pipe_resource *templ, unsigned alignment)
 {
-       struct si_screen *sscreen = (struct si_screen*)screen;
-       struct si_resource *buf = si_alloc_buffer_struct(screen, templ);
+   struct si_screen *sscreen = (struct si_screen *)screen;
+   struct si_resource *buf = si_alloc_buffer_struct(screen, templ);
 
-       if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE)
-               buf->b.b.flags |= SI_RESOURCE_FLAG_UNMAPPABLE;
+   if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE)
+      buf->b.b.flags |= SI_RESOURCE_FLAG_UNMAPPABLE;
 
-       si_init_resource_fields(sscreen, buf, templ->width0, alignment);
+   si_init_resource_fields(sscreen, buf, templ->width0, alignment);
 
-       if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE)
-               buf->flags |= RADEON_FLAG_SPARSE;
+   if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE)
+      buf->flags |= RADEON_FLAG_SPARSE;
 
-       if (!si_alloc_resource(sscreen, buf)) {
-               FREE(buf);
-               return NULL;
-       }
-       return &buf->b.b;
+   if (!si_alloc_resource(sscreen, buf)) {
+      FREE(buf);
+      return NULL;
+   }
+   return &buf->b.b;
 }
 
-struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen,
-                                                unsigned flags, unsigned usage,
-                                                unsigned size, unsigned alignment)
+struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen, unsigned flags,
+                                                 unsigned usage, unsigned size, unsigned alignment)
 {
-       struct pipe_resource buffer;
-
-       memset(&buffer, 0, sizeof buffer);
-       buffer.target = PIPE_BUFFER;
-       buffer.format = PIPE_FORMAT_R8_UNORM;
-       buffer.bind = 0;
-       buffer.usage = usage;
-       buffer.flags = flags;
-       buffer.width0 = size;
-       buffer.height0 = 1;
-       buffer.depth0 = 1;
-       buffer.array_size = 1;
-       return si_buffer_create(screen, &buffer, alignment);
+   struct pipe_resource buffer;
+
+   memset(&buffer, 0, sizeof buffer);
+   buffer.target = PIPE_BUFFER;
+   buffer.format = PIPE_FORMAT_R8_UNORM;
+   buffer.bind = 0;
+   buffer.usage = usage;
+   buffer.flags = flags;
+   buffer.width0 = size;
+   buffer.height0 = 1;
+   buffer.depth0 = 1;
+   buffer.array_size = 1;
+   return si_buffer_create(screen, &buffer, alignment);
 }
 
-struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen,
-                                              unsigned flags, unsigned usage,
-                                              unsigned size, unsigned alignment)
+struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen, unsigned flags,
+                                             unsigned usage, unsigned size, unsigned alignment)
 {
-       return si_resource(pipe_aligned_buffer_create(screen, flags, usage,
-                                                       size, alignment));
+   return si_resource(pipe_aligned_buffer_create(screen, flags, usage, size, alignment));
 }
 
-static struct pipe_resource *
-si_buffer_from_user_memory(struct pipe_screen *screen,
-                          const struct pipe_resource *templ,
-                          void *user_memory)
+static struct pipe_resource *si_buffer_from_user_memory(struct pipe_screen *screen,
+                                                        const struct pipe_resource *templ,
+                                                        void *user_memory)
 {
-       struct si_screen *sscreen = (struct si_screen*)screen;
-       struct radeon_winsys *ws = sscreen->ws;
-       struct si_resource *buf = si_alloc_buffer_struct(screen, templ);
-
-       buf->domains = RADEON_DOMAIN_GTT;
-       buf->flags = 0;
-       buf->b.is_user_ptr = true;
-       util_range_add(&buf->b.b, &buf->valid_buffer_range, 0, templ->width0);
-       util_range_add(&buf->b.b, &buf->b.valid_buffer_range, 0, templ->width0);
-
-       /* Convert a user pointer to a buffer. */
-       buf->buf = ws->buffer_from_ptr(ws, user_memory, templ->width0);
-       if (!buf->buf) {
-               FREE(buf);
-               return NULL;
-       }
-
-       buf->gpu_address = ws->buffer_get_virtual_address(buf->buf);
-       buf->vram_usage = 0;
-       buf->gart_usage = templ->width0;
-
-       return &buf->b.b;
+   struct si_screen *sscreen = (struct si_screen *)screen;
+   struct radeon_winsys *ws = sscreen->ws;
+   struct si_resource *buf = si_alloc_buffer_struct(screen, templ);
+
+   buf->domains = RADEON_DOMAIN_GTT;
+   buf->flags = 0;
+   buf->b.is_user_ptr = true;
+   util_range_add(&buf->b.b, &buf->valid_buffer_range, 0, templ->width0);
+   util_range_add(&buf->b.b, &buf->b.valid_buffer_range, 0, templ->width0);
+
+   /* Convert a user pointer to a buffer. */
+   buf->buf = ws->buffer_from_ptr(ws, user_memory, templ->width0);
+   if (!buf->buf) {
+      FREE(buf);
+      return NULL;
+   }
+
+   buf->gpu_address = ws->buffer_get_virtual_address(buf->buf);
+   buf->vram_usage = 0;
+   buf->gart_usage = templ->width0;
+
+   return &buf->b.b;
 }
 
 static struct pipe_resource *si_resource_create(struct pipe_screen *screen,
-                                               const struct pipe_resource *templ)
+                                                const struct pipe_resource *templ)
 {
-       if (templ->target == PIPE_BUFFER) {
-               return si_buffer_create(screen, templ, 256);
-       } else {
-               return si_texture_create(screen, templ);
-       }
+   if (templ->target == PIPE_BUFFER) {
+      return si_buffer_create(screen, templ, 256);
+   } else {
+      return si_texture_create(screen, templ);
+   }
 }
 
-static bool si_resource_commit(struct pipe_context *pctx,
-                              struct pipe_resource *resource,
-                              unsigned level, struct pipe_box *box,
-                              bool commit)
+static bool si_resource_commit(struct pipe_context *pctx, struct pipe_resource *resource,
+                               unsigned level, struct pipe_box *box, bool commit)
 {
-       struct si_context *ctx = (struct si_context *)pctx;
-       struct si_resource *res = si_resource(resource);
-
-       /*
-        * Since buffer commitment changes cannot be pipelined, we need to
-        * (a) flush any pending commands that refer to the buffer we're about
-        *     to change, and
-        * (b) wait for threaded submit to finish, including those that were
-        *     triggered by some other, earlier operation.
-        */
-       if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
-           ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs,
-                                              res->buf, RADEON_USAGE_READWRITE)) {
-               si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
-       }
-       if (radeon_emitted(ctx->sdma_cs, 0) &&
-           ctx->ws->cs_is_buffer_referenced(ctx->sdma_cs,
-                                              res->buf, RADEON_USAGE_READWRITE)) {
-               si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
-       }
-
-       if (ctx->sdma_cs)
-               ctx->ws->cs_sync_flush(ctx->sdma_cs);
-       ctx->ws->cs_sync_flush(ctx->gfx_cs);
-
-       assert(resource->target == PIPE_BUFFER);
-
-       return ctx->ws->buffer_commit(res->buf, box->x, box->width, commit);
+   struct si_context *ctx = (struct si_context *)pctx;
+   struct si_resource *res = si_resource(resource);
+
+   /*
+    * Since buffer commitment changes cannot be pipelined, we need to
+    * (a) flush any pending commands that refer to the buffer we're about
+    *     to change, and
+    * (b) wait for threaded submit to finish, including those that were
+    *     triggered by some other, earlier operation.
+    */
+   if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
+       ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs, res->buf, RADEON_USAGE_READWRITE)) {
+      si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+   }
+   if (radeon_emitted(ctx->sdma_cs, 0) &&
+       ctx->ws->cs_is_buffer_referenced(ctx->sdma_cs, res->buf, RADEON_USAGE_READWRITE)) {
+      si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+   }
+
+   if (ctx->sdma_cs)
+      ctx->ws->cs_sync_flush(ctx->sdma_cs);
+   ctx->ws->cs_sync_flush(ctx->gfx_cs);
+
+   assert(resource->target == PIPE_BUFFER);
+
+   return ctx->ws->buffer_commit(res->buf, box->x, box->width, commit);
 }
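
si_resource_commit() backs the pipe_context::resource_commit hook installed below, which commits or releases physical pages of sparse (PIPE_RESOURCE_FLAG_SPARSE) buffers. A minimal sketch of a caller, assuming u_box_1d() from util/u_box.h; the 64 KiB range is only an example, and the real granularity comes from the winsys sparse page size.

#include "util/u_box.h"

/* Illustration only: commit (or release, if commit == false) the first 64 KiB
 * of a sparse buffer. For PIPE_BUFFER the box is in bytes and level is 0. */
static bool commit_head_pages(struct pipe_context *pipe, struct pipe_resource *sparse_buf,
                              bool commit)
{
   struct pipe_box box;

   u_box_1d(0, 64 * 1024, &box);
   return pipe->resource_commit(pipe, sparse_buf, 0, &box, commit);
}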
 
 void si_init_screen_buffer_functions(struct si_screen *sscreen)
 {
-       sscreen->b.resource_create = si_resource_create;
-       sscreen->b.resource_destroy = u_resource_destroy_vtbl;
-       sscreen->b.resource_from_user_memory = si_buffer_from_user_memory;
+   sscreen->b.resource_create = si_resource_create;
+   sscreen->b.resource_destroy = u_resource_destroy_vtbl;
+   sscreen->b.resource_from_user_memory = si_buffer_from_user_memory;
 }
 
 void si_init_buffer_functions(struct si_context *sctx)
 {
-       sctx->b.invalidate_resource = si_invalidate_resource;
-       sctx->b.transfer_map = u_transfer_map_vtbl;
-       sctx->b.transfer_flush_region = u_transfer_flush_region_vtbl;
-       sctx->b.transfer_unmap = u_transfer_unmap_vtbl;
-       sctx->b.texture_subdata = u_default_texture_subdata;
-       sctx->b.buffer_subdata = si_buffer_subdata;
-       sctx->b.resource_commit = si_resource_commit;
+   sctx->b.invalidate_resource = si_invalidate_resource;
+   sctx->b.transfer_map = u_transfer_map_vtbl;
+   sctx->b.transfer_flush_region = u_transfer_flush_region_vtbl;
+   sctx->b.transfer_unmap = u_transfer_unmap_vtbl;
+   sctx->b.texture_subdata = u_default_texture_subdata;
+   sctx->b.buffer_subdata = si_buffer_subdata;
+   sctx->b.resource_commit = si_resource_commit;
 }
index 0b0b64ca13ca842f5f9bfe832ddf0d44288691a8..8a9b6ea5e34568f16c915b911236e4b1fdaffcd6 100644 (file)
 
 static inline void radeon_set_config_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
 {
-       assert(reg < SI_CONTEXT_REG_OFFSET);
-       assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
-       radeon_emit(cs, PKT3(PKT3_SET_CONFIG_REG, num, 0));
-       radeon_emit(cs, (reg - SI_CONFIG_REG_OFFSET) >> 2);
+   assert(reg < SI_CONTEXT_REG_OFFSET);
+   assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
+   radeon_emit(cs, PKT3(PKT3_SET_CONFIG_REG, num, 0));
+   radeon_emit(cs, (reg - SI_CONFIG_REG_OFFSET) >> 2);
 }
 
 static inline void radeon_set_config_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
 {
-       radeon_set_config_reg_seq(cs, reg, 1);
-       radeon_emit(cs, value);
+   radeon_set_config_reg_seq(cs, reg, 1);
+   radeon_emit(cs, value);
 }
 
 static inline void radeon_set_context_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
 {
-       assert(reg >= SI_CONTEXT_REG_OFFSET);
-       assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
-       radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, num, 0));
-       radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
+   assert(reg >= SI_CONTEXT_REG_OFFSET);
+   assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
+   radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, num, 0));
+   radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
 }
 
 static inline void radeon_set_context_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
 {
-       radeon_set_context_reg_seq(cs, reg, 1);
-       radeon_emit(cs, value);
+   radeon_set_context_reg_seq(cs, reg, 1);
+   radeon_emit(cs, value);
 }
 
-static inline void radeon_set_context_reg_idx(struct radeon_cmdbuf *cs,
-                                             unsigned reg, unsigned idx,
-                                             unsigned value)
+static inline void radeon_set_context_reg_idx(struct radeon_cmdbuf *cs, unsigned reg, unsigned idx,
+                                              unsigned value)
 {
-       assert(reg >= SI_CONTEXT_REG_OFFSET);
-       assert(cs->current.cdw + 3 <= cs->current.max_dw);
-       radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, 1, 0));
-       radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2 | (idx << 28));
-       radeon_emit(cs, value);
+   assert(reg >= SI_CONTEXT_REG_OFFSET);
+   assert(cs->current.cdw + 3 <= cs->current.max_dw);
+   radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, 1, 0));
+   radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2 | (idx << 28));
+   radeon_emit(cs, value);
 }
 
 static inline void radeon_set_sh_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
 {
-       assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END);
-       assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
-       radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0));
-       radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2);
+   assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END);
+   assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
+   radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0));
+   radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2);
 }
 
 static inline void radeon_set_sh_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
 {
-       radeon_set_sh_reg_seq(cs, reg, 1);
-       radeon_emit(cs, value);
+   radeon_set_sh_reg_seq(cs, reg, 1);
+   radeon_emit(cs, value);
 }
 
 static inline void radeon_set_uconfig_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
 {
-       assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
-       assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
-       radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, num, 0));
-       radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2);
+   assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
+   assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
+   radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, num, 0));
+   radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2);
 }
 
 static inline void radeon_set_uconfig_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
 {
-       radeon_set_uconfig_reg_seq(cs, reg, 1);
-       radeon_emit(cs, value);
+   radeon_set_uconfig_reg_seq(cs, reg, 1);
+   radeon_emit(cs, value);
 }
 
-static inline void radeon_set_uconfig_reg_idx(struct radeon_cmdbuf *cs,
-                                             struct si_screen *screen,
-                                             unsigned reg, unsigned idx,
-                                             unsigned value)
+static inline void radeon_set_uconfig_reg_idx(struct radeon_cmdbuf *cs, struct si_screen *screen,
+                                              unsigned reg, unsigned idx, unsigned value)
 {
-       assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
-       assert(cs->current.cdw + 3 <= cs->current.max_dw);
-       assert(idx != 0);
-       unsigned opcode = PKT3_SET_UCONFIG_REG_INDEX;
-       if (screen->info.chip_class < GFX9 ||
-           (screen->info.chip_class == GFX9 && screen->info.me_fw_version < 26))
-               opcode = PKT3_SET_UCONFIG_REG;
-       radeon_emit(cs, PKT3(opcode, 1, 0));
-       radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2 | (idx << 28));
-       radeon_emit(cs, value);
+   assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
+   assert(cs->current.cdw + 3 <= cs->current.max_dw);
+   assert(idx != 0);
+   unsigned opcode = PKT3_SET_UCONFIG_REG_INDEX;
+   if (screen->info.chip_class < GFX9 ||
+       (screen->info.chip_class == GFX9 && screen->info.me_fw_version < 26))
+      opcode = PKT3_SET_UCONFIG_REG;
+   radeon_emit(cs, PKT3(opcode, 1, 0));
+   radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2 | (idx << 28));
+   radeon_emit(cs, value);
 }
 
 static inline void radeon_set_context_reg_rmw(struct radeon_cmdbuf *cs, unsigned reg,
-                                             unsigned value, unsigned mask)
+                                              unsigned value, unsigned mask)
 {
-       assert(reg >= SI_CONTEXT_REG_OFFSET);
-       assert(cs->current.cdw + 4 <= cs->current.max_dw);
-       radeon_emit(cs, PKT3(PKT3_CONTEXT_REG_RMW, 2, 0));
-       radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
-       radeon_emit(cs, mask);
-       radeon_emit(cs, value);
+   assert(reg >= SI_CONTEXT_REG_OFFSET);
+   assert(cs->current.cdw + 4 <= cs->current.max_dw);
+   radeon_emit(cs, PKT3(PKT3_CONTEXT_REG_RMW, 2, 0));
+   radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
+   radeon_emit(cs, mask);
+   radeon_emit(cs, value);
 }
 
 /* Emit PKT3_CONTEXT_REG_RMW if the register value is different. */
 static inline void radeon_opt_set_context_reg_rmw(struct si_context *sctx, unsigned offset,
-                                                 enum si_tracked_reg reg, unsigned value,
-                                                 unsigned mask)
+                                                  enum si_tracked_reg reg, unsigned value,
+                                                  unsigned mask)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
-       assert((value & ~mask) == 0);
-       value &= mask;
+   assert((value & ~mask) == 0);
+   value &= mask;
 
-       if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 ||
-           sctx->tracked_regs.reg_value[reg] != value) {
-               radeon_set_context_reg_rmw(cs, offset, value, mask);
+   if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 ||
+       sctx->tracked_regs.reg_value[reg] != value) {
+      radeon_set_context_reg_rmw(cs, offset, value, mask);
 
-               sctx->tracked_regs.reg_saved |= 0x1ull << reg;
-               sctx->tracked_regs.reg_value[reg] = value;
-       }
+      sctx->tracked_regs.reg_saved |= 0x1ull << reg;
+      sctx->tracked_regs.reg_value[reg] = value;
+   }
 }
 
 /* Emit PKT3_SET_CONTEXT_REG if the register value is different. */
 static inline void radeon_opt_set_context_reg(struct si_context *sctx, unsigned offset,
-                                             enum si_tracked_reg reg, unsigned value)
+                                              enum si_tracked_reg reg, unsigned value)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
-       if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 ||
-           sctx->tracked_regs.reg_value[reg] != value) {
-               radeon_set_context_reg(cs, offset, value);
+   if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 ||
+       sctx->tracked_regs.reg_value[reg] != value) {
+      radeon_set_context_reg(cs, offset, value);
 
-               sctx->tracked_regs.reg_saved |= 0x1ull << reg;
-               sctx->tracked_regs.reg_value[reg] = value;
-       }
+      sctx->tracked_regs.reg_saved |= 0x1ull << reg;
+      sctx->tracked_regs.reg_value[reg] = value;
+   }
 }
 
 /**
@@ -168,98 +165,96 @@ static inline void radeon_opt_set_context_reg(struct si_context *sctx, unsigned
  * @param value2        is written to second register
  */
 static inline void radeon_opt_set_context_reg2(struct si_context *sctx, unsigned offset,
-                                              enum si_tracked_reg reg, unsigned value1,
-                                              unsigned value2)
+                                               enum si_tracked_reg reg, unsigned value1,
+                                               unsigned value2)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-
-       if (((sctx->tracked_regs.reg_saved >> reg) & 0x3) != 0x3 ||
-           sctx->tracked_regs.reg_value[reg] != value1 ||
-           sctx->tracked_regs.reg_value[reg+1] != value2) {
-               radeon_set_context_reg_seq(cs, offset, 2);
-               radeon_emit(cs, value1);
-               radeon_emit(cs, value2);
-
-               sctx->tracked_regs.reg_value[reg] = value1;
-               sctx->tracked_regs.reg_value[reg+1] = value2;
-               sctx->tracked_regs.reg_saved |= 0x3ull << reg;
-       }
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+   if (((sctx->tracked_regs.reg_saved >> reg) & 0x3) != 0x3 ||
+       sctx->tracked_regs.reg_value[reg] != value1 ||
+       sctx->tracked_regs.reg_value[reg + 1] != value2) {
+      radeon_set_context_reg_seq(cs, offset, 2);
+      radeon_emit(cs, value1);
+      radeon_emit(cs, value2);
+
+      sctx->tracked_regs.reg_value[reg] = value1;
+      sctx->tracked_regs.reg_value[reg + 1] = value2;
+      sctx->tracked_regs.reg_saved |= 0x3ull << reg;
+   }
 }
 
 /**
  * Set 3 consecutive registers if any register's value is different.
  */
 static inline void radeon_opt_set_context_reg3(struct si_context *sctx, unsigned offset,
-                                              enum si_tracked_reg reg, unsigned value1,
-                                              unsigned value2, unsigned value3)
+                                               enum si_tracked_reg reg, unsigned value1,
+                                               unsigned value2, unsigned value3)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-
-       if (((sctx->tracked_regs.reg_saved >> reg) & 0x7) != 0x7 ||
-           sctx->tracked_regs.reg_value[reg] != value1 ||
-           sctx->tracked_regs.reg_value[reg+1] != value2 ||
-           sctx->tracked_regs.reg_value[reg+2] != value3) {
-               radeon_set_context_reg_seq(cs, offset, 3);
-               radeon_emit(cs, value1);
-               radeon_emit(cs, value2);
-               radeon_emit(cs, value3);
-
-               sctx->tracked_regs.reg_value[reg] = value1;
-               sctx->tracked_regs.reg_value[reg+1] = value2;
-               sctx->tracked_regs.reg_value[reg+2] = value3;
-               sctx->tracked_regs.reg_saved |= 0x7ull << reg;
-       }
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+   if (((sctx->tracked_regs.reg_saved >> reg) & 0x7) != 0x7 ||
+       sctx->tracked_regs.reg_value[reg] != value1 ||
+       sctx->tracked_regs.reg_value[reg + 1] != value2 ||
+       sctx->tracked_regs.reg_value[reg + 2] != value3) {
+      radeon_set_context_reg_seq(cs, offset, 3);
+      radeon_emit(cs, value1);
+      radeon_emit(cs, value2);
+      radeon_emit(cs, value3);
+
+      sctx->tracked_regs.reg_value[reg] = value1;
+      sctx->tracked_regs.reg_value[reg + 1] = value2;
+      sctx->tracked_regs.reg_value[reg + 2] = value3;
+      sctx->tracked_regs.reg_saved |= 0x7ull << reg;
+   }
 }
 
 /**
  * Set 4 consecutive registers if any register's value is different.
  */
 static inline void radeon_opt_set_context_reg4(struct si_context *sctx, unsigned offset,
-                                              enum si_tracked_reg reg, unsigned value1,
-                                              unsigned value2, unsigned value3,
-                                              unsigned value4)
+                                               enum si_tracked_reg reg, unsigned value1,
+                                               unsigned value2, unsigned value3, unsigned value4)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-
-       if (((sctx->tracked_regs.reg_saved >> reg) & 0xf) != 0xf ||
-           sctx->tracked_regs.reg_value[reg] != value1 ||
-           sctx->tracked_regs.reg_value[reg+1] != value2 ||
-           sctx->tracked_regs.reg_value[reg+2] != value3 ||
-           sctx->tracked_regs.reg_value[reg+3] != value4) {
-               radeon_set_context_reg_seq(cs, offset, 4);
-               radeon_emit(cs, value1);
-               radeon_emit(cs, value2);
-               radeon_emit(cs, value3);
-               radeon_emit(cs, value4);
-
-               sctx->tracked_regs.reg_value[reg] = value1;
-               sctx->tracked_regs.reg_value[reg+1] = value2;
-               sctx->tracked_regs.reg_value[reg+2] = value3;
-               sctx->tracked_regs.reg_value[reg+3] = value4;
-               sctx->tracked_regs.reg_saved |= 0xfull << reg;
-       }
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+   if (((sctx->tracked_regs.reg_saved >> reg) & 0xf) != 0xf ||
+       sctx->tracked_regs.reg_value[reg] != value1 ||
+       sctx->tracked_regs.reg_value[reg + 1] != value2 ||
+       sctx->tracked_regs.reg_value[reg + 2] != value3 ||
+       sctx->tracked_regs.reg_value[reg + 3] != value4) {
+      radeon_set_context_reg_seq(cs, offset, 4);
+      radeon_emit(cs, value1);
+      radeon_emit(cs, value2);
+      radeon_emit(cs, value3);
+      radeon_emit(cs, value4);
+
+      sctx->tracked_regs.reg_value[reg] = value1;
+      sctx->tracked_regs.reg_value[reg + 1] = value2;
+      sctx->tracked_regs.reg_value[reg + 2] = value3;
+      sctx->tracked_regs.reg_value[reg + 3] = value4;
+      sctx->tracked_regs.reg_saved |= 0xfull << reg;
+   }
 }
 
 /**
  * Set consecutive registers if any register's value is different.
  */
 static inline void radeon_opt_set_context_regn(struct si_context *sctx, unsigned offset,
-                                              unsigned *value, unsigned *saved_val,
-                                              unsigned num)
+                                               unsigned *value, unsigned *saved_val, unsigned num)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       int i, j;
-
-       for (i = 0; i < num; i++) {
-               if (saved_val[i] != value[i]) {
-                       radeon_set_context_reg_seq(cs, offset, num);
-                       for (j = 0; j < num; j++)
-                               radeon_emit(cs, value[j]);
-
-                       memcpy(saved_val, value, sizeof(uint32_t) * num);
-                       break;
-               }
-       }
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   int i, j;
+
+   for (i = 0; i < num; i++) {
+      if (saved_val[i] != value[i]) {
+         radeon_set_context_reg_seq(cs, offset, num);
+         for (j = 0; j < num; j++)
+            radeon_emit(cs, value[j]);
+
+         memcpy(saved_val, value, sizeof(uint32_t) * num);
+         break;
+      }
+   }
 }
 
 #endif
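
To make the redundant-write elimination used by the radeon_opt_set_context_reg* helpers above easier to follow in isolation, here is a minimal, self-contained sketch of the same shadow-and-compare pattern. It is a toy model with made-up names (tracked_regs, opt_set_reg, emit_reg_write), not the radeonsi API:

#include <stdint.h>
#include <stdio.h>

#define NUM_TRACKED_REGS 4

struct tracked_regs {
   uint64_t saved;                   /* bit i set => value[i] is known */
   unsigned value[NUM_TRACKED_REGS]; /* last value written to each register */
};

/* Stand-in for appending a SET_CONTEXT_REG packet to the command stream. */
static void emit_reg_write(unsigned reg, unsigned value)
{
   printf("SET_CONTEXT_REG reg=%u value=0x%08x\n", reg, value);
}

static void opt_set_reg(struct tracked_regs *t, unsigned reg, unsigned value)
{
   /* Skip the packet if the register is already known to hold this value. */
   if (((t->saved >> reg) & 1) && t->value[reg] == value)
      return;

   emit_reg_write(reg, value);
   t->saved |= 1ull << reg;
   t->value[reg] = value;
}

int main(void)
{
   struct tracked_regs t = {0};

   opt_set_reg(&t, 2, 0xcafe); /* emitted: value not tracked yet */
   opt_set_reg(&t, 2, 0xcafe); /* skipped: same value already shadowed */
   opt_set_reg(&t, 2, 0xbeef); /* emitted: value changed */
   return 0;
}
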
index 2af778b41ad423ac8ff1937652b00bd21abc6242..1e7aa4432228b45336e10bdd125558545eda2b2d 100644 (file)
 
 #include "si_pipe.h"
 #include "sid.h"
-
 #include "util/format/u_format.h"
 #include "util/u_pack_color.h"
 #include "util/u_surface.h"
 
-enum {
-       SI_CLEAR         = SI_SAVE_FRAGMENT_STATE,
-       SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE,
+enum
+{
+   SI_CLEAR = SI_SAVE_FRAGMENT_STATE,
+   SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE,
 };
 
-static void si_alloc_separate_cmask(struct si_screen *sscreen,
-                                   struct si_texture *tex)
+static void si_alloc_separate_cmask(struct si_screen *sscreen, struct si_texture *tex)
 {
-       /* CMASK for MSAA is allocated in advance or always disabled
-        * by "nofmask" option.
-        */
-       if (tex->cmask_buffer || !tex->surface.cmask_size ||
-           tex->buffer.b.b.nr_samples >= 2)
-                return;
-
-       tex->cmask_buffer =
-               si_aligned_buffer_create(&sscreen->b,
-                                        SI_RESOURCE_FLAG_UNMAPPABLE,
-                                        PIPE_USAGE_DEFAULT,
-                                        tex->surface.cmask_size,
-                                        tex->surface.cmask_alignment);
-       if (tex->cmask_buffer == NULL)
-               return;
-
-       tex->cmask_base_address_reg = tex->cmask_buffer->gpu_address >> 8;
-       tex->cb_color_info |= S_028C70_FAST_CLEAR(1);
-
-       p_atomic_inc(&sscreen->compressed_colortex_counter);
+   /* CMASK for MSAA is allocated in advance or always disabled
+    * by "nofmask" option.
+    */
+   if (tex->cmask_buffer || !tex->surface.cmask_size || tex->buffer.b.b.nr_samples >= 2)
+      return;
+
+   tex->cmask_buffer =
+      si_aligned_buffer_create(&sscreen->b, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT,
+                               tex->surface.cmask_size, tex->surface.cmask_alignment);
+   if (tex->cmask_buffer == NULL)
+      return;
+
+   tex->cmask_base_address_reg = tex->cmask_buffer->gpu_address >> 8;
+   tex->cb_color_info |= S_028C70_FAST_CLEAR(1);
+
+   p_atomic_inc(&sscreen->compressed_colortex_counter);
 }
 
-static bool si_set_clear_color(struct si_texture *tex,
-                              enum pipe_format surface_format,
-                              const union pipe_color_union *color)
+static bool si_set_clear_color(struct si_texture *tex, enum pipe_format surface_format,
+                               const union pipe_color_union *color)
 {
-       union util_color uc;
-
-       memset(&uc, 0, sizeof(uc));
-
-       if (tex->surface.bpe == 16) {
-               /* DCC fast clear only:
-                *   CLEAR_WORD0 = R = G = B
-                *   CLEAR_WORD1 = A
-                */
-               assert(color->ui[0] == color->ui[1] &&
-                      color->ui[0] == color->ui[2]);
-               uc.ui[0] = color->ui[0];
-               uc.ui[1] = color->ui[3];
-       } else {
-               util_pack_color_union(surface_format, &uc, color);
-       }
-
-       if (memcmp(tex->color_clear_value, &uc, 2 * sizeof(uint32_t)) == 0)
-               return false;
-
-       memcpy(tex->color_clear_value, &uc, 2 * sizeof(uint32_t));
-       return true;
+   union util_color uc;
+
+   memset(&uc, 0, sizeof(uc));
+
+   if (tex->surface.bpe == 16) {
+      /* DCC fast clear only:
+       *   CLEAR_WORD0 = R = G = B
+       *   CLEAR_WORD1 = A
+       */
+      assert(color->ui[0] == color->ui[1] && color->ui[0] == color->ui[2]);
+      uc.ui[0] = color->ui[0];
+      uc.ui[1] = color->ui[3];
+   } else {
+      util_pack_color_union(surface_format, &uc, color);
+   }
+
+   if (memcmp(tex->color_clear_value, &uc, 2 * sizeof(uint32_t)) == 0)
+      return false;
+
+   memcpy(tex->color_clear_value, &uc, 2 * sizeof(uint32_t));
+   return true;
 }
 
 /** Linearize and convert luminance/intensity to red. */
 enum pipe_format si_simplify_cb_format(enum pipe_format format)
 {
-       format = util_format_linear(format);
-       format = util_format_luminance_to_red(format);
-       return util_format_intensity_to_red(format);
+   format = util_format_linear(format);
+   format = util_format_luminance_to_red(format);
+   return util_format_intensity_to_red(format);
 }
 
 bool vi_alpha_is_on_msb(struct si_screen *sscreen, enum pipe_format format)
 {
-       format = si_simplify_cb_format(format);
-       const struct util_format_description *desc = util_format_description(format);
+   format = si_simplify_cb_format(format);
+   const struct util_format_description *desc = util_format_description(format);
 
-       /* Formats with 3 channels can't have alpha. */
-       if (desc->nr_channels == 3)
-               return true; /* same as xxxA; is any value OK here? */
+   /* Formats with 3 channels can't have alpha. */
+   if (desc->nr_channels == 3)
+      return true; /* same as xxxA; is any value OK here? */
 
-       if (sscreen->info.chip_class >= GFX10 && desc->nr_channels == 1)
-               return desc->swizzle[3] == PIPE_SWIZZLE_X;
+   if (sscreen->info.chip_class >= GFX10 && desc->nr_channels == 1)
+      return desc->swizzle[3] == PIPE_SWIZZLE_X;
 
-       return si_translate_colorswap(format, false) <= 1;
+   return si_translate_colorswap(format, false) <= 1;
 }
 
-static bool vi_get_fast_clear_parameters(struct si_screen *sscreen,
-                                        enum pipe_format base_format,
-                                        enum pipe_format surface_format,
-                                        const union pipe_color_union *color,
-                                        uint32_t* clear_value,
-                                        bool *eliminate_needed)
+static bool vi_get_fast_clear_parameters(struct si_screen *sscreen, enum pipe_format base_format,
+                                         enum pipe_format surface_format,
+                                         const union pipe_color_union *color, uint32_t *clear_value,
+                                         bool *eliminate_needed)
 {
-       /* If we want to clear without needing a fast clear eliminate step, we
-        * can set color and alpha independently to 0 or 1 (or 0/max for integer
-        * formats).
-        */
-       bool values[4] = {}; /* whether to clear to 0 or 1 */
-       bool color_value = false; /* clear color to 0 or 1 */
-       bool alpha_value = false; /* clear alpha to 0 or 1 */
-       int alpha_channel; /* index of the alpha component */
-       bool has_color = false;
-       bool has_alpha = false;
-
-       const struct util_format_description *desc =
-               util_format_description(si_simplify_cb_format(surface_format));
-
-       /* 128-bit fast clear with different R,G,B values is unsupported. */
-       if (desc->block.bits == 128 &&
-           (color->ui[0] != color->ui[1] ||
-            color->ui[0] != color->ui[2]))
-               return false;
-
-       *eliminate_needed = true;
-       *clear_value = DCC_CLEAR_COLOR_REG;
-
-       if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
-               return true; /* need ELIMINATE_FAST_CLEAR */
-
-       bool base_alpha_is_on_msb = vi_alpha_is_on_msb(sscreen, base_format);
-       bool surf_alpha_is_on_msb = vi_alpha_is_on_msb(sscreen, surface_format);
-
-       /* Formats with 3 channels can't have alpha. */
-       if (desc->nr_channels == 3)
-               alpha_channel = -1;
-       else if (surf_alpha_is_on_msb)
-               alpha_channel = desc->nr_channels - 1;
-       else
-               alpha_channel = 0;
-
-       for (int i = 0; i < 4; ++i) {
-               if (desc->swizzle[i] >= PIPE_SWIZZLE_0)
-                       continue;
-
-               if (desc->channel[i].pure_integer &&
-                   desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
-                       /* Use the maximum value for clamping the clear color. */
-                       int max = u_bit_consecutive(0, desc->channel[i].size - 1);
-
-                       values[i] = color->i[i] != 0;
-                       if (color->i[i] != 0 && MIN2(color->i[i], max) != max)
-                               return true; /* need ELIMINATE_FAST_CLEAR */
-               } else if (desc->channel[i].pure_integer &&
-                          desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
-                       /* Use the maximum value for clamping the clear color. */
-                       unsigned max = u_bit_consecutive(0, desc->channel[i].size);
-
-                       values[i] = color->ui[i] != 0U;
-                       if (color->ui[i] != 0U && MIN2(color->ui[i], max) != max)
-                               return true; /* need ELIMINATE_FAST_CLEAR */
-               } else {
-                       values[i] = color->f[i] != 0.0F;
-                       if (color->f[i] != 0.0F && color->f[i] != 1.0F)
-                               return true; /* need ELIMINATE_FAST_CLEAR */
-               }
-
-               if (desc->swizzle[i] == alpha_channel) {
-                       alpha_value = values[i];
-                       has_alpha = true;
-               } else {
-                       color_value = values[i];
-                       has_color = true;
-               }
-       }
-
-       /* If alpha isn't present, make it the same as color, and vice versa. */
-       if (!has_alpha)
-               alpha_value = color_value;
-       else if (!has_color)
-               color_value = alpha_value;
-
-       if (color_value != alpha_value &&
-           base_alpha_is_on_msb != surf_alpha_is_on_msb)
-               return true; /* require ELIMINATE_FAST_CLEAR */
-
-       /* Check if all color values are equal if they are present. */
-       for (int i = 0; i < 4; ++i) {
-               if (desc->swizzle[i] <= PIPE_SWIZZLE_W &&
-                   desc->swizzle[i] != alpha_channel &&
-                   values[i] != color_value)
-                       return true; /* require ELIMINATE_FAST_CLEAR */
-       }
-
-       /* This doesn't need ELIMINATE_FAST_CLEAR.
-        * On chips predating Raven2, the DCC clear codes and the CB clear
-        * color registers must match.
-        */
-       *eliminate_needed = false;
-
-       if (color_value) {
-               if (alpha_value)
-                       *clear_value = DCC_CLEAR_COLOR_1111;
-               else
-                       *clear_value = DCC_CLEAR_COLOR_1110;
-       } else {
-               if (alpha_value)
-                       *clear_value = DCC_CLEAR_COLOR_0001;
-               else
-                       *clear_value = DCC_CLEAR_COLOR_0000;
-       }
-       return true;
+   /* If we want to clear without needing a fast clear eliminate step, we
+    * can set color and alpha independently to 0 or 1 (or 0/max for integer
+    * formats).
+    */
+   bool values[4] = {};      /* whether to clear to 0 or 1 */
+   bool color_value = false; /* clear color to 0 or 1 */
+   bool alpha_value = false; /* clear alpha to 0 or 1 */
+   int alpha_channel;        /* index of the alpha component */
+   bool has_color = false;
+   bool has_alpha = false;
+
+   const struct util_format_description *desc =
+      util_format_description(si_simplify_cb_format(surface_format));
+
+   /* 128-bit fast clear with different R,G,B values is unsupported. */
+   if (desc->block.bits == 128 && (color->ui[0] != color->ui[1] || color->ui[0] != color->ui[2]))
+      return false;
+
+   *eliminate_needed = true;
+   *clear_value = DCC_CLEAR_COLOR_REG;
+
+   if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
+      return true; /* need ELIMINATE_FAST_CLEAR */
+
+   bool base_alpha_is_on_msb = vi_alpha_is_on_msb(sscreen, base_format);
+   bool surf_alpha_is_on_msb = vi_alpha_is_on_msb(sscreen, surface_format);
+
+   /* Formats with 3 channels can't have alpha. */
+   if (desc->nr_channels == 3)
+      alpha_channel = -1;
+   else if (surf_alpha_is_on_msb)
+      alpha_channel = desc->nr_channels - 1;
+   else
+      alpha_channel = 0;
+
+   for (int i = 0; i < 4; ++i) {
+      if (desc->swizzle[i] >= PIPE_SWIZZLE_0)
+         continue;
+
+      if (desc->channel[i].pure_integer && desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
+         /* Use the maximum value for clamping the clear color. */
+         int max = u_bit_consecutive(0, desc->channel[i].size - 1);
+
+         values[i] = color->i[i] != 0;
+         if (color->i[i] != 0 && MIN2(color->i[i], max) != max)
+            return true; /* need ELIMINATE_FAST_CLEAR */
+      } else if (desc->channel[i].pure_integer &&
+                 desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
+         /* Use the maximum value for clamping the clear color. */
+         unsigned max = u_bit_consecutive(0, desc->channel[i].size);
+
+         values[i] = color->ui[i] != 0U;
+         if (color->ui[i] != 0U && MIN2(color->ui[i], max) != max)
+            return true; /* need ELIMINATE_FAST_CLEAR */
+      } else {
+         values[i] = color->f[i] != 0.0F;
+         if (color->f[i] != 0.0F && color->f[i] != 1.0F)
+            return true; /* need ELIMINATE_FAST_CLEAR */
+      }
+
+      if (desc->swizzle[i] == alpha_channel) {
+         alpha_value = values[i];
+         has_alpha = true;
+      } else {
+         color_value = values[i];
+         has_color = true;
+      }
+   }
+
+   /* If alpha isn't present, make it the same as color, and vice versa. */
+   if (!has_alpha)
+      alpha_value = color_value;
+   else if (!has_color)
+      color_value = alpha_value;
+
+   if (color_value != alpha_value && base_alpha_is_on_msb != surf_alpha_is_on_msb)
+      return true; /* require ELIMINATE_FAST_CLEAR */
+
+   /* Check if all color values are equal if they are present. */
+   for (int i = 0; i < 4; ++i) {
+      if (desc->swizzle[i] <= PIPE_SWIZZLE_W && desc->swizzle[i] != alpha_channel &&
+          values[i] != color_value)
+         return true; /* require ELIMINATE_FAST_CLEAR */
+   }
+
+   /* This doesn't need ELIMINATE_FAST_CLEAR.
+    * On chips predating Raven2, the DCC clear codes and the CB clear
+    * color registers must match.
+    */
+   *eliminate_needed = false;
+
+   if (color_value) {
+      if (alpha_value)
+         *clear_value = DCC_CLEAR_COLOR_1111;
+      else
+         *clear_value = DCC_CLEAR_COLOR_1110;
+   } else {
+      if (alpha_value)
+         *clear_value = DCC_CLEAR_COLOR_0001;
+      else
+         *clear_value = DCC_CLEAR_COLOR_0000;
+   }
+   return true;
 }
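
The constant-encode selection at the end of vi_get_fast_clear_parameters above reduces to a two-flag lookup once every channel is known to clear to exactly 0 or 1. A minimal stand-alone sketch of that mapping (only the DCC_CLEAR_COLOR_* names are taken from the code above; the rest is made up for illustration):

#include <stdbool.h>
#include <stdio.h>

static const char *dcc_code_for(bool color_is_one, bool alpha_is_one)
{
   if (color_is_one)
      return alpha_is_one ? "DCC_CLEAR_COLOR_1111" : "DCC_CLEAR_COLOR_1110";
   return alpha_is_one ? "DCC_CLEAR_COLOR_0001" : "DCC_CLEAR_COLOR_0000";
}

int main(void)
{
   /* Clearing to opaque black (0, 0, 0, 1) needs no eliminate pass. */
   printf("%s\n", dcc_code_for(false, true)); /* DCC_CLEAR_COLOR_0001 */
   /* A clear color like (0.5, 0.5, 0.5, 1) would instead fall back to
    * DCC_CLEAR_COLOR_REG with eliminate_needed = true. */
   return 0;
}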
 
-bool vi_dcc_clear_level(struct si_context *sctx,
-                       struct si_texture *tex,
-                       unsigned level, unsigned clear_value)
+bool vi_dcc_clear_level(struct si_context *sctx, struct si_texture *tex, unsigned level,
+                        unsigned clear_value)
 {
-       struct pipe_resource *dcc_buffer;
-       uint64_t dcc_offset, clear_size;
-
-       assert(vi_dcc_enabled(tex, level));
-
-       if (tex->dcc_separate_buffer) {
-               dcc_buffer = &tex->dcc_separate_buffer->b.b;
-               dcc_offset = 0;
-       } else {
-               dcc_buffer = &tex->buffer.b.b;
-               dcc_offset = tex->surface.dcc_offset;
-       }
-
-       if (sctx->chip_class >= GFX9) {
-               /* Mipmap level clears aren't implemented. */
-               if (tex->buffer.b.b.last_level > 0)
-                       return false;
-
-               /* 4x and 8x MSAA needs a sophisticated compute shader for
-                * the clear. See AMDVLK. */
-               if (tex->buffer.b.b.nr_storage_samples >= 4)
-                       return false;
-
-               clear_size = tex->surface.dcc_size;
-       } else {
-               unsigned num_layers = util_num_layers(&tex->buffer.b.b, level);
-
-               /* If this is 0, fast clear isn't possible. (can occur with MSAA) */
-               if (!tex->surface.u.legacy.level[level].dcc_fast_clear_size)
-                       return false;
-
-               /* Layered 4x and 8x MSAA DCC fast clears need to clear
-                * dcc_fast_clear_size bytes for each layer. A compute shader
-                * would be more efficient than separate per-layer clear operations.
-                */
-               if (tex->buffer.b.b.nr_storage_samples >= 4 && num_layers > 1)
-                       return false;
-
-               dcc_offset += tex->surface.u.legacy.level[level].dcc_offset;
-               clear_size = tex->surface.u.legacy.level[level].dcc_fast_clear_size *
-                            num_layers;
-       }
-
-       si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size,
-                       &clear_value, 4, SI_COHERENCY_CB_META, false);
-       return true;
+   struct pipe_resource *dcc_buffer;
+   uint64_t dcc_offset, clear_size;
+
+   assert(vi_dcc_enabled(tex, level));
+
+   if (tex->dcc_separate_buffer) {
+      dcc_buffer = &tex->dcc_separate_buffer->b.b;
+      dcc_offset = 0;
+   } else {
+      dcc_buffer = &tex->buffer.b.b;
+      dcc_offset = tex->surface.dcc_offset;
+   }
+
+   if (sctx->chip_class >= GFX9) {
+      /* Mipmap level clears aren't implemented. */
+      if (tex->buffer.b.b.last_level > 0)
+         return false;
+
+      /* 4x and 8x MSAA needs a sophisticated compute shader for
+       * the clear. See AMDVLK. */
+      if (tex->buffer.b.b.nr_storage_samples >= 4)
+         return false;
+
+      clear_size = tex->surface.dcc_size;
+   } else {
+      unsigned num_layers = util_num_layers(&tex->buffer.b.b, level);
+
+      /* If this is 0, fast clear isn't possible. (can occur with MSAA) */
+      if (!tex->surface.u.legacy.level[level].dcc_fast_clear_size)
+         return false;
+
+      /* Layered 4x and 8x MSAA DCC fast clears need to clear
+       * dcc_fast_clear_size bytes for each layer. A compute shader
+       * would be more efficient than separate per-layer clear operations.
+       */
+      if (tex->buffer.b.b.nr_storage_samples >= 4 && num_layers > 1)
+         return false;
+
+      dcc_offset += tex->surface.u.legacy.level[level].dcc_offset;
+      clear_size = tex->surface.u.legacy.level[level].dcc_fast_clear_size * num_layers;
+   }
+
+   si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size, &clear_value, 4, SI_COHERENCY_CB_META,
+                   false);
+   return true;
 }
 
 /* Set the same micro tile mode as the destination of the last MSAA resolve.
  * This allows hitting the MSAA resolve fast path, which requires that both
  * src and dst micro tile modes match.
  */
-static void si_set_optimal_micro_tile_mode(struct si_screen *sscreen,
-                                          struct si_texture *tex)
+static void si_set_optimal_micro_tile_mode(struct si_screen *sscreen, struct si_texture *tex)
 {
-       if (sscreen->info.chip_class >= GFX10 ||
-           tex->buffer.b.is_shared ||
-           tex->buffer.b.b.nr_samples <= 1 ||
-           tex->surface.micro_tile_mode == tex->last_msaa_resolve_target_micro_mode)
-               return;
-
-       assert(sscreen->info.chip_class >= GFX9 ||
-              tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_2D);
-       assert(tex->buffer.b.b.last_level == 0);
-
-       if (sscreen->info.chip_class >= GFX9) {
-               /* 4K or larger tiles only. 0 is linear. 1-3 are 256B tiles. */
-               assert(tex->surface.u.gfx9.surf.swizzle_mode >= 4);
-
-               /* If you do swizzle_mode % 4, you'll get:
-                *   0 = Depth
-                *   1 = Standard,
-                *   2 = Displayable
-                *   3 = Rotated
-                *
-                * Depth-sample order isn't allowed:
-                */
-               assert(tex->surface.u.gfx9.surf.swizzle_mode % 4 != 0);
-
-               switch (tex->last_msaa_resolve_target_micro_mode) {
-               case RADEON_MICRO_MODE_DISPLAY:
-                       tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
-                       tex->surface.u.gfx9.surf.swizzle_mode += 2; /* D */
-                       break;
-               case RADEON_MICRO_MODE_THIN:
-                       tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
-                       tex->surface.u.gfx9.surf.swizzle_mode += 1; /* S */
-                       break;
-               case RADEON_MICRO_MODE_ROTATED:
-                       tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
-                       tex->surface.u.gfx9.surf.swizzle_mode += 3; /* R */
-                       break;
-               default: /* depth */
-                       assert(!"unexpected micro mode");
-                       return;
-               }
-       } else if (sscreen->info.chip_class >= GFX7) {
-               /* These magic numbers were copied from addrlib. It doesn't use
-                * any definitions for them either. They are all 2D_TILED_THIN1
-                * modes with different bpp and micro tile mode.
-                */
-               switch (tex->last_msaa_resolve_target_micro_mode) {
-               case RADEON_MICRO_MODE_DISPLAY:
-                       tex->surface.u.legacy.tiling_index[0] = 10;
-                       break;
-               case RADEON_MICRO_MODE_THIN:
-                       tex->surface.u.legacy.tiling_index[0] = 14;
-                       break;
-               case RADEON_MICRO_MODE_ROTATED:
-                       tex->surface.u.legacy.tiling_index[0] = 28;
-                       break;
-               default: /* depth, thick */
-                       assert(!"unexpected micro mode");
-                       return;
-               }
-       } else { /* GFX6 */
-               switch (tex->last_msaa_resolve_target_micro_mode) {
-               case RADEON_MICRO_MODE_DISPLAY:
-                       switch (tex->surface.bpe) {
-                       case 1:
-                            tex->surface.u.legacy.tiling_index[0] = 10;
-                            break;
-                       case 2:
-                            tex->surface.u.legacy.tiling_index[0] = 11;
-                            break;
-                       default: /* 4, 8 */
-                            tex->surface.u.legacy.tiling_index[0] = 12;
-                            break;
-                       }
-                       break;
-               case RADEON_MICRO_MODE_THIN:
-                       switch (tex->surface.bpe) {
-                       case 1:
-                                tex->surface.u.legacy.tiling_index[0] = 14;
-                                break;
-                       case 2:
-                                tex->surface.u.legacy.tiling_index[0] = 15;
-                                break;
-                       case 4:
-                                tex->surface.u.legacy.tiling_index[0] = 16;
-                                break;
-                       default: /* 8, 16 */
-                                tex->surface.u.legacy.tiling_index[0] = 17;
-                                break;
-                       }
-                       break;
-               default: /* depth, thick */
-                       assert(!"unexpected micro mode");
-                       return;
-               }
-       }
-
-       tex->surface.micro_tile_mode = tex->last_msaa_resolve_target_micro_mode;
-
-       p_atomic_inc(&sscreen->dirty_tex_counter);
+   if (sscreen->info.chip_class >= GFX10 || tex->buffer.b.is_shared ||
+       tex->buffer.b.b.nr_samples <= 1 ||
+       tex->surface.micro_tile_mode == tex->last_msaa_resolve_target_micro_mode)
+      return;
+
+   assert(sscreen->info.chip_class >= GFX9 ||
+          tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_2D);
+   assert(tex->buffer.b.b.last_level == 0);
+
+   if (sscreen->info.chip_class >= GFX9) {
+      /* 4K or larger tiles only. 0 is linear. 1-3 are 256B tiles. */
+      assert(tex->surface.u.gfx9.surf.swizzle_mode >= 4);
+
+      /* If you do swizzle_mode % 4, you'll get:
+       *   0 = Depth
+       *   1 = Standard,
+       *   2 = Displayable
+       *   3 = Rotated
+       *
+       * Depth-sample order isn't allowed:
+       */
+      assert(tex->surface.u.gfx9.surf.swizzle_mode % 4 != 0);
+
+      switch (tex->last_msaa_resolve_target_micro_mode) {
+      case RADEON_MICRO_MODE_DISPLAY:
+         tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
+         tex->surface.u.gfx9.surf.swizzle_mode += 2; /* D */
+         break;
+      case RADEON_MICRO_MODE_THIN:
+         tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
+         tex->surface.u.gfx9.surf.swizzle_mode += 1; /* S */
+         break;
+      case RADEON_MICRO_MODE_ROTATED:
+         tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
+         tex->surface.u.gfx9.surf.swizzle_mode += 3; /* R */
+         break;
+      default: /* depth */
+         assert(!"unexpected micro mode");
+         return;
+      }
+   } else if (sscreen->info.chip_class >= GFX7) {
+      /* These magic numbers were copied from addrlib. It doesn't use
+       * any definitions for them either. They are all 2D_TILED_THIN1
+       * modes with different bpp and micro tile mode.
+       */
+      switch (tex->last_msaa_resolve_target_micro_mode) {
+      case RADEON_MICRO_MODE_DISPLAY:
+         tex->surface.u.legacy.tiling_index[0] = 10;
+         break;
+      case RADEON_MICRO_MODE_THIN:
+         tex->surface.u.legacy.tiling_index[0] = 14;
+         break;
+      case RADEON_MICRO_MODE_ROTATED:
+         tex->surface.u.legacy.tiling_index[0] = 28;
+         break;
+      default: /* depth, thick */
+         assert(!"unexpected micro mode");
+         return;
+      }
+   } else { /* GFX6 */
+      switch (tex->last_msaa_resolve_target_micro_mode) {
+      case RADEON_MICRO_MODE_DISPLAY:
+         switch (tex->surface.bpe) {
+         case 1:
+            tex->surface.u.legacy.tiling_index[0] = 10;
+            break;
+         case 2:
+            tex->surface.u.legacy.tiling_index[0] = 11;
+            break;
+         default: /* 4, 8 */
+            tex->surface.u.legacy.tiling_index[0] = 12;
+            break;
+         }
+         break;
+      case RADEON_MICRO_MODE_THIN:
+         switch (tex->surface.bpe) {
+         case 1:
+            tex->surface.u.legacy.tiling_index[0] = 14;
+            break;
+         case 2:
+            tex->surface.u.legacy.tiling_index[0] = 15;
+            break;
+         case 4:
+            tex->surface.u.legacy.tiling_index[0] = 16;
+            break;
+         default: /* 8, 16 */
+            tex->surface.u.legacy.tiling_index[0] = 17;
+            break;
+         }
+         break;
+      default: /* depth, thick */
+         assert(!"unexpected micro mode");
+         return;
+      }
+   }
+
+   tex->surface.micro_tile_mode = tex->last_msaa_resolve_target_micro_mode;
+
+   p_atomic_inc(&sscreen->dirty_tex_counter);
 }
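
On GFX9 the function above retargets the micro tile mode by rewriting only the low two bits of swizzle_mode (1 = Standard, 2 = Displayable, 3 = Rotated) while keeping the tile-size bits intact. A small stand-alone sketch of just that bit manipulation, using a hypothetical swizzle_mode value:

#include <stdio.h>

/* Keep the upper bits (tile size), replace only the micro tile variant. */
static unsigned retarget_swizzle_mode(unsigned swizzle_mode, unsigned variant)
{
   return (swizzle_mode & ~0x3u) + variant;
}

int main(void)
{
   unsigned mode = 21; /* hypothetical: 21 % 4 == 1, i.e. a Standard (S) mode */
   printf("%u\n", retarget_swizzle_mode(mode, 2)); /* 22: same size, Displayable */
   return 0;
}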
 
-static void si_do_fast_color_clear(struct si_context *sctx,
-                                  unsigned *buffers,
-                                  const union pipe_color_union *color)
+static void si_do_fast_color_clear(struct si_context *sctx, unsigned *buffers,
+                                   const union pipe_color_union *color)
 {
-       struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
-       int i;
+   struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
+   int i;
 
-       /* This function is broken in BE, so just disable this path for now */
+   /* This function is broken in BE, so just disable this path for now */
 #if UTIL_ARCH_BIG_ENDIAN
-       return;
+   return;
 #endif
 
-       if (sctx->render_cond)
-               return;
-
-       for (i = 0; i < fb->nr_cbufs; i++) {
-               struct si_texture *tex;
-               unsigned clear_bit = PIPE_CLEAR_COLOR0 << i;
-
-               if (!fb->cbufs[i])
-                       continue;
-
-               /* if this colorbuffer is not being cleared */
-               if (!(*buffers & clear_bit))
-                       continue;
-
-               unsigned level = fb->cbufs[i]->u.tex.level;
-               if (level > 0)
-                       continue;
-
-               tex = (struct si_texture *)fb->cbufs[i]->texture;
-
-               /* TODO: GFX9: Implement DCC fast clear for level 0 of
-                * mipmapped textures. Mipmapped DCC has to clear a rectangular
-                * area of DCC for level 0 (because the whole miptree is
-                * organized in a 2D plane).
-                */
-               if (sctx->chip_class >= GFX9 &&
-                   tex->buffer.b.b.last_level > 0)
-                       continue;
-
-               /* the clear is allowed if all layers are bound */
-               if (fb->cbufs[i]->u.tex.first_layer != 0 ||
-                   fb->cbufs[i]->u.tex.last_layer != util_max_layer(&tex->buffer.b.b, 0)) {
-                       continue;
-               }
-
-               /* only supported on tiled surfaces */
-               if (tex->surface.is_linear) {
-                       continue;
-               }
-
-               /* shared textures can't use fast clear without an explicit flush,
-                * because there is no way to communicate the clear color among
-                * all clients
-                */
-               if (tex->buffer.b.is_shared &&
-                   !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
-                       continue;
-
-               if (sctx->chip_class <= GFX8 &&
-                   tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_1D &&
-                   !sctx->screen->info.htile_cmask_support_1d_tiling)
-                       continue;
-
-               /* Use a slow clear for small surfaces where the cost of
-                * the eliminate pass can be higher than the benefit of fast
-                * clear. The closed driver does this, but the numbers may differ.
-                *
-                * This helps on both dGPUs and APUs, even small APUs like Mullins.
-                */
-               bool too_small = tex->buffer.b.b.nr_samples <= 1 &&
-                                tex->buffer.b.b.width0 *
-                                tex->buffer.b.b.height0 <= 512 * 512;
-               bool eliminate_needed = false;
-               bool fmask_decompress_needed = false;
-
-               /* Fast clear is the most appropriate place to enable DCC for
-                * displayable surfaces.
-                */
-               if (sctx->family == CHIP_STONEY && !too_small) {
-                       vi_separate_dcc_try_enable(sctx, tex);
-
-                       /* RB+ isn't supported with a CMASK clear only on Stoney,
-                        * so all clears are considered to be hypothetically slow
-                        * clears, which is weighed when determining whether to
-                        * enable separate DCC.
-                        */
-                       if (tex->dcc_gather_statistics) /* only for Stoney */
-                               tex->num_slow_clears++;
-               }
-
-               /* Try to clear DCC first, otherwise try CMASK. */
-               if (vi_dcc_enabled(tex, 0)) {
-                       uint32_t reset_value;
-
-                       if (sctx->screen->debug_flags & DBG(NO_DCC_CLEAR))
-                               continue;
-
-                       if (!vi_get_fast_clear_parameters(sctx->screen,
-                                                         tex->buffer.b.b.format,
-                                                         fb->cbufs[i]->format,
-                                                         color, &reset_value,
-                                                         &eliminate_needed))
-                               continue;
-
-                       if (eliminate_needed && too_small)
-                               continue;
-
-                       /* TODO: This DCC+CMASK clear doesn't work with MSAA. */
-                       if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer &&
-                           eliminate_needed)
-                               continue;
-
-                       if (!vi_dcc_clear_level(sctx, tex, 0, reset_value))
-                               continue;
-
-                       tex->separate_dcc_dirty = true;
-                       tex->displayable_dcc_dirty = true;
-
-                       /* DCC fast clear with MSAA should clear CMASK to 0xC. */
-                       if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) {
-                               uint32_t clear_value = 0xCCCCCCCC;
-                               si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
-                                               tex->surface.cmask_offset, tex->surface.cmask_size,
-                                               &clear_value, 4, SI_COHERENCY_CB_META, false);
-                               fmask_decompress_needed = true;
-                       }
-               } else {
-                       if (too_small)
-                               continue;
-
-                       /* 128-bit formats are unusupported */
-                       if (tex->surface.bpe > 8) {
-                               continue;
-                       }
-
-                       /* RB+ doesn't work with CMASK fast clear on Stoney. */
-                       if (sctx->family == CHIP_STONEY)
-                               continue;
-
-                       /* ensure CMASK is enabled */
-                       si_alloc_separate_cmask(sctx->screen, tex);
-                       if (!tex->cmask_buffer)
-                               continue;
-
-                       /* Do the fast clear. */
-                       uint32_t clear_value = 0;
-                       si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
-                                       tex->surface.cmask_offset, tex->surface.cmask_size,
-                                       &clear_value, 4, SI_COHERENCY_CB_META, false);
-                       eliminate_needed = true;
-               }
-
-               if ((eliminate_needed || fmask_decompress_needed) &&
-                   !(tex->dirty_level_mask & (1 << level))) {
-                       tex->dirty_level_mask |= 1 << level;
-                       p_atomic_inc(&sctx->screen->compressed_colortex_counter);
-               }
-
-               /* We can change the micro tile mode before a full clear. */
-               si_set_optimal_micro_tile_mode(sctx->screen, tex);
-
-               *buffers &= ~clear_bit;
-
-               /* Chips with DCC constant encoding don't need to set the clear
-                * color registers for DCC clear values 0 and 1.
-                */
-               if (sctx->screen->info.has_dcc_constant_encode && !eliminate_needed)
-                       continue;
-
-               if (si_set_clear_color(tex, fb->cbufs[i]->format, color)) {
-                       sctx->framebuffer.dirty_cbufs |= 1 << i;
-                       si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
-               }
-       }
+   if (sctx->render_cond)
+      return;
+
+   for (i = 0; i < fb->nr_cbufs; i++) {
+      struct si_texture *tex;
+      unsigned clear_bit = PIPE_CLEAR_COLOR0 << i;
+
+      if (!fb->cbufs[i])
+         continue;
+
+      /* if this colorbuffer is not being cleared */
+      if (!(*buffers & clear_bit))
+         continue;
+
+      unsigned level = fb->cbufs[i]->u.tex.level;
+      if (level > 0)
+         continue;
+
+      tex = (struct si_texture *)fb->cbufs[i]->texture;
+
+      /* TODO: GFX9: Implement DCC fast clear for level 0 of
+       * mipmapped textures. Mipmapped DCC has to clear a rectangular
+       * area of DCC for level 0 (because the whole miptree is
+       * organized in a 2D plane).
+       */
+      if (sctx->chip_class >= GFX9 && tex->buffer.b.b.last_level > 0)
+         continue;
+
+      /* the clear is allowed if all layers are bound */
+      if (fb->cbufs[i]->u.tex.first_layer != 0 ||
+          fb->cbufs[i]->u.tex.last_layer != util_max_layer(&tex->buffer.b.b, 0)) {
+         continue;
+      }
+
+      /* only supported on tiled surfaces */
+      if (tex->surface.is_linear) {
+         continue;
+      }
+
+      /* shared textures can't use fast clear without an explicit flush,
+       * because there is no way to communicate the clear color among
+       * all clients
+       */
+      if (tex->buffer.b.is_shared &&
+          !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
+         continue;
+
+      if (sctx->chip_class <= GFX8 && tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_1D &&
+          !sctx->screen->info.htile_cmask_support_1d_tiling)
+         continue;
+
+      /* Use a slow clear for small surfaces where the cost of
+       * the eliminate pass can be higher than the benefit of fast
+       * clear. The closed driver does this, but the numbers may differ.
+       *
+       * This helps on both dGPUs and APUs, even small APUs like Mullins.
+       */
+      bool too_small = tex->buffer.b.b.nr_samples <= 1 &&
+                       tex->buffer.b.b.width0 * tex->buffer.b.b.height0 <= 512 * 512;
+      bool eliminate_needed = false;
+      bool fmask_decompress_needed = false;
+
+      /* Fast clear is the most appropriate place to enable DCC for
+       * displayable surfaces.
+       */
+      if (sctx->family == CHIP_STONEY && !too_small) {
+         vi_separate_dcc_try_enable(sctx, tex);
+
+         /* RB+ isn't supported with a CMASK clear only on Stoney,
+          * so all clears are considered to be hypothetically slow
+          * clears, which is weighed when determining whether to
+          * enable separate DCC.
+          */
+         if (tex->dcc_gather_statistics) /* only for Stoney */
+            tex->num_slow_clears++;
+      }
+
+      /* Try to clear DCC first, otherwise try CMASK. */
+      if (vi_dcc_enabled(tex, 0)) {
+         uint32_t reset_value;
+
+         if (sctx->screen->debug_flags & DBG(NO_DCC_CLEAR))
+            continue;
+
+         if (!vi_get_fast_clear_parameters(sctx->screen, tex->buffer.b.b.format,
+                                           fb->cbufs[i]->format, color, &reset_value,
+                                           &eliminate_needed))
+            continue;
+
+         if (eliminate_needed && too_small)
+            continue;
+
+         /* TODO: This DCC+CMASK clear doesn't work with MSAA. */
+         if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer && eliminate_needed)
+            continue;
+
+         if (!vi_dcc_clear_level(sctx, tex, 0, reset_value))
+            continue;
+
+         tex->separate_dcc_dirty = true;
+         tex->displayable_dcc_dirty = true;
+
+         /* DCC fast clear with MSAA should clear CMASK to 0xC. */
+         if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) {
+            uint32_t clear_value = 0xCCCCCCCC;
+            si_clear_buffer(sctx, &tex->cmask_buffer->b.b, tex->surface.cmask_offset,
+                            tex->surface.cmask_size, &clear_value, 4, SI_COHERENCY_CB_META, false);
+            fmask_decompress_needed = true;
+         }
+      } else {
+         if (too_small)
+            continue;
+
+         /* 128-bit formats are unsupported */
+         if (tex->surface.bpe > 8) {
+            continue;
+         }
+
+         /* RB+ doesn't work with CMASK fast clear on Stoney. */
+         if (sctx->family == CHIP_STONEY)
+            continue;
+
+         /* ensure CMASK is enabled */
+         si_alloc_separate_cmask(sctx->screen, tex);
+         if (!tex->cmask_buffer)
+            continue;
+
+         /* Do the fast clear. */
+         uint32_t clear_value = 0;
+         si_clear_buffer(sctx, &tex->cmask_buffer->b.b, tex->surface.cmask_offset,
+                         tex->surface.cmask_size, &clear_value, 4, SI_COHERENCY_CB_META, false);
+         eliminate_needed = true;
+      }
+
+      if ((eliminate_needed || fmask_decompress_needed) &&
+          !(tex->dirty_level_mask & (1 << level))) {
+         tex->dirty_level_mask |= 1 << level;
+         p_atomic_inc(&sctx->screen->compressed_colortex_counter);
+      }
+
+      /* We can change the micro tile mode before a full clear. */
+      si_set_optimal_micro_tile_mode(sctx->screen, tex);
+
+      *buffers &= ~clear_bit;
+
+      /* Chips with DCC constant encoding don't need to set the clear
+       * color registers for DCC clear values 0 and 1.
+       */
+      if (sctx->screen->info.has_dcc_constant_encode && !eliminate_needed)
+         continue;
+
+      if (si_set_clear_color(tex, fb->cbufs[i]->format, color)) {
+         sctx->framebuffer.dirty_cbufs |= 1 << i;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
+      }
+   }
 }
 
 static void si_clear(struct pipe_context *ctx, unsigned buffers,
-                    const union pipe_color_union *color,
-                    double depth, unsigned stencil)
+                     const union pipe_color_union *color, double depth, unsigned stencil)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
-       struct pipe_surface *zsbuf = fb->zsbuf;
-       struct si_texture *zstex =
-               zsbuf ? (struct si_texture*)zsbuf->texture : NULL;
-       bool needs_db_flush = false;
-
-       if (buffers & PIPE_CLEAR_COLOR) {
-               si_do_fast_color_clear(sctx, &buffers, color);
-               if (!buffers)
-                       return; /* all buffers have been fast cleared */
-
-               /* These buffers cannot use fast clear, make sure to disable expansion. */
-               for (unsigned i = 0; i < fb->nr_cbufs; i++) {
-                       struct si_texture *tex;
-
-                       /* If not clearing this buffer, skip. */
-                       if (!(buffers & (PIPE_CLEAR_COLOR0 << i)) || !fb->cbufs[i])
-                               continue;
-
-                       tex = (struct si_texture *)fb->cbufs[i]->texture;
-                       if (tex->surface.fmask_size == 0)
-                               tex->dirty_level_mask &= ~(1 << fb->cbufs[i]->u.tex.level);
-               }
-       }
-
-       if (zstex &&
-           zsbuf->u.tex.first_layer == 0 &&
-           zsbuf->u.tex.last_layer == util_max_layer(&zstex->buffer.b.b, 0)) {
-               /* TC-compatible HTILE only supports depth clears to 0 or 1. */
-               if (buffers & PIPE_CLEAR_DEPTH &&
-                   si_htile_enabled(zstex, zsbuf->u.tex.level, PIPE_MASK_Z) &&
-                   (!zstex->tc_compatible_htile ||
-                    depth == 0 || depth == 1)) {
-                       /* Need to disable EXPCLEAR temporarily if clearing
-                        * to a new value. */
-                       if (!zstex->depth_cleared || zstex->depth_clear_value != depth) {
-                               sctx->db_depth_disable_expclear = true;
-                       }
-
-                       if (zstex->depth_clear_value != (float)depth) {
-                               if ((zstex->depth_clear_value != 0) != (depth != 0)) {
-                                       /* ZRANGE_PRECISION register of a bound surface will change so we
-                                        * must flush the DB caches. */
-                                       needs_db_flush = true;
-                               }
-                               /* Update DB_DEPTH_CLEAR. */
-                               zstex->depth_clear_value = depth;
-                               sctx->framebuffer.dirty_zsbuf = true;
-                               si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
-                       }
-                       sctx->db_depth_clear = true;
-                       si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
-               }
-
-               /* TC-compatible HTILE only supports stencil clears to 0. */
-               if (buffers & PIPE_CLEAR_STENCIL &&
-                   si_htile_enabled(zstex, zsbuf->u.tex.level, PIPE_MASK_S) &&
-                   (!zstex->tc_compatible_htile || stencil == 0)) {
-                       stencil &= 0xff;
-
-                       /* Need to disable EXPCLEAR temporarily if clearing
-                        * to a new value. */
-                       if (!zstex->stencil_cleared || zstex->stencil_clear_value != stencil) {
-                               sctx->db_stencil_disable_expclear = true;
-                       }
-
-                       if (zstex->stencil_clear_value != (uint8_t)stencil) {
-                               /* Update DB_STENCIL_CLEAR. */
-                               zstex->stencil_clear_value = stencil;
-                               sctx->framebuffer.dirty_zsbuf = true;
-                               si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
-                       }
-                       sctx->db_stencil_clear = true;
-                       si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
-               }
-
-               if (needs_db_flush)
-                       sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB;
-       }
-
-       si_blitter_begin(sctx, SI_CLEAR);
-       util_blitter_clear(sctx->blitter, fb->width, fb->height,
-                          util_framebuffer_get_num_layers(fb),
-                          buffers, color, depth, stencil,
-                          sctx->framebuffer.nr_samples > 1);
-       si_blitter_end(sctx);
-
-       if (sctx->db_depth_clear) {
-               sctx->db_depth_clear = false;
-               sctx->db_depth_disable_expclear = false;
-               zstex->depth_cleared = true;
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
-       }
-
-       if (sctx->db_stencil_clear) {
-               sctx->db_stencil_clear = false;
-               sctx->db_stencil_disable_expclear = false;
-               zstex->stencil_cleared = true;
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
-       }
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
+   struct pipe_surface *zsbuf = fb->zsbuf;
+   struct si_texture *zstex = zsbuf ? (struct si_texture *)zsbuf->texture : NULL;
+   bool needs_db_flush = false;
+
+   if (buffers & PIPE_CLEAR_COLOR) {
+      si_do_fast_color_clear(sctx, &buffers, color);
+      if (!buffers)
+         return; /* all buffers have been fast cleared */
+
+      /* These buffers cannot use fast clear, make sure to disable expansion. */
+      for (unsigned i = 0; i < fb->nr_cbufs; i++) {
+         struct si_texture *tex;
+
+         /* If not clearing this buffer, skip. */
+         if (!(buffers & (PIPE_CLEAR_COLOR0 << i)) || !fb->cbufs[i])
+            continue;
+
+         tex = (struct si_texture *)fb->cbufs[i]->texture;
+         if (tex->surface.fmask_size == 0)
+            tex->dirty_level_mask &= ~(1 << fb->cbufs[i]->u.tex.level);
+      }
+   }
+
+   if (zstex && zsbuf->u.tex.first_layer == 0 &&
+       zsbuf->u.tex.last_layer == util_max_layer(&zstex->buffer.b.b, 0)) {
+      /* TC-compatible HTILE only supports depth clears to 0 or 1. */
+      if (buffers & PIPE_CLEAR_DEPTH && si_htile_enabled(zstex, zsbuf->u.tex.level, PIPE_MASK_Z) &&
+          (!zstex->tc_compatible_htile || depth == 0 || depth == 1)) {
+         /* Need to disable EXPCLEAR temporarily if clearing
+          * to a new value. */
+         if (!zstex->depth_cleared || zstex->depth_clear_value != depth) {
+            sctx->db_depth_disable_expclear = true;
+         }
+
+         if (zstex->depth_clear_value != (float)depth) {
+            if ((zstex->depth_clear_value != 0) != (depth != 0)) {
+               /* ZRANGE_PRECISION register of a bound surface will change so we
+                * must flush the DB caches. */
+               needs_db_flush = true;
+            }
+            /* Update DB_DEPTH_CLEAR. */
+            zstex->depth_clear_value = depth;
+            sctx->framebuffer.dirty_zsbuf = true;
+            si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
+         }
+         sctx->db_depth_clear = true;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+      }
+
+      /* TC-compatible HTILE only supports stencil clears to 0. */
+      if (buffers & PIPE_CLEAR_STENCIL &&
+          si_htile_enabled(zstex, zsbuf->u.tex.level, PIPE_MASK_S) &&
+          (!zstex->tc_compatible_htile || stencil == 0)) {
+         stencil &= 0xff;
+
+         /* Need to disable EXPCLEAR temporarily if clearing
+          * to a new value. */
+         if (!zstex->stencil_cleared || zstex->stencil_clear_value != stencil) {
+            sctx->db_stencil_disable_expclear = true;
+         }
+
+         if (zstex->stencil_clear_value != (uint8_t)stencil) {
+            /* Update DB_STENCIL_CLEAR. */
+            zstex->stencil_clear_value = stencil;
+            sctx->framebuffer.dirty_zsbuf = true;
+            si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
+         }
+         sctx->db_stencil_clear = true;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+      }
+
+      if (needs_db_flush)
+         sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB;
+   }
+
+   si_blitter_begin(sctx, SI_CLEAR);
+   util_blitter_clear(sctx->blitter, fb->width, fb->height, util_framebuffer_get_num_layers(fb),
+                      buffers, color, depth, stencil, sctx->framebuffer.nr_samples > 1);
+   si_blitter_end(sctx);
+
+   if (sctx->db_depth_clear) {
+      sctx->db_depth_clear = false;
+      sctx->db_depth_disable_expclear = false;
+      zstex->depth_cleared = true;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+   }
+
+   if (sctx->db_stencil_clear) {
+      sctx->db_stencil_clear = false;
+      sctx->db_stencil_disable_expclear = false;
+      zstex->stencil_cleared = true;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+   }
 }
 
-static void si_clear_render_target(struct pipe_context *ctx,
-                                  struct pipe_surface *dst,
-                                  const union pipe_color_union *color,
-                                  unsigned dstx, unsigned dsty,
-                                  unsigned width, unsigned height,
-                                  bool render_condition_enabled)
+static void si_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dst,
+                                   const union pipe_color_union *color, unsigned dstx,
+                                   unsigned dsty, unsigned width, unsigned height,
+                                   bool render_condition_enabled)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_texture *sdst = (struct si_texture*)dst->texture;
-
-       if (dst->texture->nr_samples <= 1 && !sdst->surface.dcc_offset) {
-               si_compute_clear_render_target(ctx, dst, color, dstx, dsty, width,
-                                              height, render_condition_enabled);
-               return;
-       }
-
-       si_blitter_begin(sctx, SI_CLEAR_SURFACE |
-                        (render_condition_enabled ? 0 : SI_DISABLE_RENDER_COND));
-       util_blitter_clear_render_target(sctx->blitter, dst, color,
-                                        dstx, dsty, width, height);
-       si_blitter_end(sctx);
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_texture *sdst = (struct si_texture *)dst->texture;
+
+   if (dst->texture->nr_samples <= 1 && !sdst->surface.dcc_offset) {
+      si_compute_clear_render_target(ctx, dst, color, dstx, dsty, width, height,
+                                     render_condition_enabled);
+      return;
+   }
+
+   si_blitter_begin(sctx,
+                    SI_CLEAR_SURFACE | (render_condition_enabled ? 0 : SI_DISABLE_RENDER_COND));
+   util_blitter_clear_render_target(sctx->blitter, dst, color, dstx, dsty, width, height);
+   si_blitter_end(sctx);
 }
 
-static void si_clear_depth_stencil(struct pipe_context *ctx,
-                                  struct pipe_surface *dst,
-                                  unsigned clear_flags,
-                                  double depth,
-                                  unsigned stencil,
-                                  unsigned dstx, unsigned dsty,
-                                  unsigned width, unsigned height,
-                                  bool render_condition_enabled)
+static void si_clear_depth_stencil(struct pipe_context *ctx, struct pipe_surface *dst,
+                                   unsigned clear_flags, double depth, unsigned stencil,
+                                   unsigned dstx, unsigned dsty, unsigned width, unsigned height,
+                                   bool render_condition_enabled)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
+   struct si_context *sctx = (struct si_context *)ctx;
 
-       si_blitter_begin(sctx, SI_CLEAR_SURFACE |
-                        (render_condition_enabled ? 0 : SI_DISABLE_RENDER_COND));
-       util_blitter_clear_depth_stencil(sctx->blitter, dst, clear_flags, depth, stencil,
-                                        dstx, dsty, width, height);
-       si_blitter_end(sctx);
+   si_blitter_begin(sctx,
+                    SI_CLEAR_SURFACE | (render_condition_enabled ? 0 : SI_DISABLE_RENDER_COND));
+   util_blitter_clear_depth_stencil(sctx->blitter, dst, clear_flags, depth, stencil, dstx, dsty,
+                                    width, height);
+   si_blitter_end(sctx);
 }
 
-static void si_clear_texture(struct pipe_context *pipe,
-                            struct pipe_resource *tex,
-                            unsigned level,
-                            const struct pipe_box *box,
-                            const void *data)
+static void si_clear_texture(struct pipe_context *pipe, struct pipe_resource *tex, unsigned level,
+                             const struct pipe_box *box, const void *data)
 {
-       struct pipe_screen *screen = pipe->screen;
-       struct si_texture *stex = (struct si_texture*)tex;
-       struct pipe_surface tmpl = {{0}};
-       struct pipe_surface *sf;
-
-       tmpl.format = tex->format;
-       tmpl.u.tex.first_layer = box->z;
-       tmpl.u.tex.last_layer = box->z + box->depth - 1;
-       tmpl.u.tex.level = level;
-       sf = pipe->create_surface(pipe, tex, &tmpl);
-       if (!sf)
-               return;
-
-       if (stex->is_depth) {
-               unsigned clear;
-               float depth;
-               uint8_t stencil = 0;
-
-               /* Depth is always present. */
-               clear = PIPE_CLEAR_DEPTH;
-               util_format_unpack_z_float(tex->format, &depth, data, 1);
-
-               if (stex->surface.has_stencil) {
-                       clear |= PIPE_CLEAR_STENCIL;
-                       util_format_unpack_s_8uint(tex->format,
-                                                  &stencil, data, 1);
-               }
-
-               si_clear_depth_stencil(pipe, sf, clear, depth, stencil,
-                                      box->x, box->y,
-                                      box->width, box->height, false);
-       } else {
-               union pipe_color_union color;
-
-               util_format_unpack_rgba(tex->format, color.ui, data, 1);
-
-               if (screen->is_format_supported(screen, tex->format,
-                                               tex->target, 0, 0,
-                                               PIPE_BIND_RENDER_TARGET)) {
-                       si_clear_render_target(pipe, sf, &color,
-                                              box->x, box->y,
-                                              box->width, box->height, false);
-               } else {
-                       /* Software fallback - just for R9G9B9E5_FLOAT */
-                       util_clear_render_target(pipe, sf, &color,
-                                                box->x, box->y,
-                                                box->width, box->height);
-               }
-       }
-       pipe_surface_reference(&sf, NULL);
+   struct pipe_screen *screen = pipe->screen;
+   struct si_texture *stex = (struct si_texture *)tex;
+   struct pipe_surface tmpl = {{0}};
+   struct pipe_surface *sf;
+
+   tmpl.format = tex->format;
+   tmpl.u.tex.first_layer = box->z;
+   tmpl.u.tex.last_layer = box->z + box->depth - 1;
+   tmpl.u.tex.level = level;
+   sf = pipe->create_surface(pipe, tex, &tmpl);
+   if (!sf)
+      return;
+
+   if (stex->is_depth) {
+      unsigned clear;
+      float depth;
+      uint8_t stencil = 0;
+
+      /* Depth is always present. */
+      clear = PIPE_CLEAR_DEPTH;
+      util_format_unpack_z_float(tex->format, &depth, data, 1);
+
+      if (stex->surface.has_stencil) {
+         clear |= PIPE_CLEAR_STENCIL;
+         util_format_unpack_s_8uint(tex->format, &stencil, data, 1);
+      }
+
+      si_clear_depth_stencil(pipe, sf, clear, depth, stencil, box->x, box->y, box->width,
+                             box->height, false);
+   } else {
+      union pipe_color_union color;
+
+      util_format_unpack_rgba(tex->format, color.ui, data, 1);
+
+      if (screen->is_format_supported(screen, tex->format, tex->target, 0, 0,
+                                      PIPE_BIND_RENDER_TARGET)) {
+         si_clear_render_target(pipe, sf, &color, box->x, box->y, box->width, box->height, false);
+      } else {
+         /* Software fallback - just for R9G9B9E5_FLOAT */
+         util_clear_render_target(pipe, sf, &color, box->x, box->y, box->width, box->height);
+      }
+   }
+   pipe_surface_reference(&sf, NULL);
 }
 
 void si_init_clear_functions(struct si_context *sctx)
 {
-       sctx->b.clear_render_target = si_clear_render_target;
-       sctx->b.clear_texture = si_clear_texture;
+   sctx->b.clear_render_target = si_clear_render_target;
+   sctx->b.clear_texture = si_clear_texture;
 
-       if (sctx->has_graphics) {
-               sctx->b.clear = si_clear;
-               sctx->b.clear_depth_stencil = si_clear_depth_stencil;
-       }
+   if (sctx->has_graphics) {
+      sctx->b.clear = si_clear;
+      sctx->b.clear_depth_stencil = si_clear_depth_stencil;
+   }
 }
src/gallium/drivers/radeonsi/si_compute.c
index 610c13335974442268b5dd847b7e279a3cbe9bb2..5dca5730a58cd727ebe9a9adf84400eeba33d06a 100644
  *
  */
 
-#include "nir/tgsi_to_nir.h"
-#include "util/u_async_debug.h"
-#include "util/u_memory.h"
-#include "util/u_upload_mgr.h"
+#include "si_compute.h"
 
 #include "ac_rtld.h"
 #include "amd_kernel_code_t.h"
+#include "nir/tgsi_to_nir.h"
 #include "si_build_pm4.h"
-#include "si_compute.h"
+#include "util/u_async_debug.h"
+#include "util/u_memory.h"
+#include "util/u_upload_mgr.h"
 
-#define COMPUTE_DBG(sscreen, fmt, args...) \
-       do { \
-               if ((sscreen->debug_flags & DBG(COMPUTE))) fprintf(stderr, fmt, ##args); \
-       } while (0);
+#define COMPUTE_DBG(sscreen, fmt, args...)                                                         \
+   do {                                                                                            \
+      if ((sscreen->debug_flags & DBG(COMPUTE)))                                                   \
+         fprintf(stderr, fmt, ##args);                                                             \
+   } while (0);
 
 struct dispatch_packet {
-       uint16_t header;
-       uint16_t setup;
-       uint16_t workgroup_size_x;
-       uint16_t workgroup_size_y;
-       uint16_t workgroup_size_z;
-       uint16_t reserved0;
-       uint32_t grid_size_x;
-       uint32_t grid_size_y;
-       uint32_t grid_size_z;
-       uint32_t private_segment_size;
-       uint32_t group_segment_size;
-       uint64_t kernel_object;
-       uint64_t kernarg_address;
-       uint64_t reserved2;
+   uint16_t header;
+   uint16_t setup;
+   uint16_t workgroup_size_x;
+   uint16_t workgroup_size_y;
+   uint16_t workgroup_size_z;
+   uint16_t reserved0;
+   uint32_t grid_size_x;
+   uint32_t grid_size_y;
+   uint32_t grid_size_z;
+   uint32_t private_segment_size;
+   uint32_t group_segment_size;
+   uint64_t kernel_object;
+   uint64_t kernarg_address;
+   uint64_t reserved2;
 };
 
-static const amd_kernel_code_t *si_compute_get_code_object(
-       const struct si_compute *program,
-       uint64_t symbol_offset)
+static const amd_kernel_code_t *si_compute_get_code_object(const struct si_compute *program,
+                                                           uint64_t symbol_offset)
 {
-       const struct si_shader_selector *sel = &program->sel;
+   const struct si_shader_selector *sel = &program->sel;
 
-       if (program->ir_type != PIPE_SHADER_IR_NATIVE)
-               return NULL;
+   if (program->ir_type != PIPE_SHADER_IR_NATIVE)
+      return NULL;
 
-       struct ac_rtld_binary rtld;
-       if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){
-                       .info = &sel->screen->info,
-                       .shader_type = MESA_SHADER_COMPUTE,
-                       .wave_size = sel->screen->compute_wave_size,
-                       .num_parts = 1,
-                       .elf_ptrs = &program->shader.binary.elf_buffer,
-                       .elf_sizes = &program->shader.binary.elf_size }))
-               return NULL;
+   struct ac_rtld_binary rtld;
+   if (!ac_rtld_open(&rtld,
+                     (struct ac_rtld_open_info){.info = &sel->screen->info,
+                                                .shader_type = MESA_SHADER_COMPUTE,
+                                                .wave_size = sel->screen->compute_wave_size,
+                                                .num_parts = 1,
+                                                .elf_ptrs = &program->shader.binary.elf_buffer,
+                                                .elf_sizes = &program->shader.binary.elf_size}))
+      return NULL;
 
-       const amd_kernel_code_t *result = NULL;
-       const char *text;
-       size_t size;
-       if (!ac_rtld_get_section_by_name(&rtld, ".text", &text, &size))
-               goto out;
+   const amd_kernel_code_t *result = NULL;
+   const char *text;
+   size_t size;
+   if (!ac_rtld_get_section_by_name(&rtld, ".text", &text, &size))
+      goto out;
 
-       if (symbol_offset + sizeof(amd_kernel_code_t) > size)
-               goto out;
+   if (symbol_offset + sizeof(amd_kernel_code_t) > size)
+      goto out;
 
-       result = (const amd_kernel_code_t*)(text + symbol_offset);
+   result = (const amd_kernel_code_t *)(text + symbol_offset);
 
 out:
-       ac_rtld_close(&rtld);
-       return result;
+   ac_rtld_close(&rtld);
+   return result;
 }
 
 static void code_object_to_config(const amd_kernel_code_t *code_object,
-                                 struct ac_shader_config *out_config) {
-
-       uint32_t rsrc1 = code_object->compute_pgm_resource_registers;
-       uint32_t rsrc2 = code_object->compute_pgm_resource_registers >> 32;
-       out_config->num_sgprs = code_object->wavefront_sgpr_count;
-       out_config->num_vgprs = code_object->workitem_vgpr_count;
-       out_config->float_mode = G_00B028_FLOAT_MODE(rsrc1);
-       out_config->rsrc1 = rsrc1;
-       out_config->lds_size = MAX2(out_config->lds_size, G_00B84C_LDS_SIZE(rsrc2));
-       out_config->rsrc2 = rsrc2;
-       out_config->scratch_bytes_per_wave =
-               align(code_object->workitem_private_segment_byte_size * 64, 1024);
+                                  struct ac_shader_config *out_config)
+{
+
+   uint32_t rsrc1 = code_object->compute_pgm_resource_registers;
+   uint32_t rsrc2 = code_object->compute_pgm_resource_registers >> 32;
+   out_config->num_sgprs = code_object->wavefront_sgpr_count;
+   out_config->num_vgprs = code_object->workitem_vgpr_count;
+   out_config->float_mode = G_00B028_FLOAT_MODE(rsrc1);
+   out_config->rsrc1 = rsrc1;
+   out_config->lds_size = MAX2(out_config->lds_size, G_00B84C_LDS_SIZE(rsrc2));
+   out_config->rsrc2 = rsrc2;
+   out_config->scratch_bytes_per_wave =
+      align(code_object->workitem_private_segment_byte_size * 64, 1024);
 }
 
 /* Asynchronous compute shader compilation. */
 static void si_create_compute_state_async(void *job, int thread_index)
 {
-       struct si_compute *program = (struct si_compute *)job;
-       struct si_shader_selector *sel = &program->sel;
-       struct si_shader *shader = &program->shader;
-       struct ac_llvm_compiler *compiler;
-       struct pipe_debug_callback *debug = &sel->compiler_ctx_state.debug;
-       struct si_screen *sscreen = sel->screen;
-
-       assert(!debug->debug_message || debug->async);
-       assert(thread_index >= 0);
-       assert(thread_index < ARRAY_SIZE(sscreen->compiler));
-       compiler = &sscreen->compiler[thread_index];
-
-       if (!compiler->passes)
-               si_init_compiler(sscreen, compiler);
-
-       assert(program->ir_type == PIPE_SHADER_IR_NIR);
-       si_nir_scan_shader(sel->nir, &sel->info);
-
-       /* Store the declared LDS size into si_shader_info for the shader
-        * cache to include it.
-        */
-       sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE] = program->local_size;
-
-       si_get_active_slot_masks(&sel->info,
-                                &sel->active_const_and_shader_buffers,
-                                &sel->active_samplers_and_images);
-
-       program->shader.is_monolithic = true;
-       program->reads_variable_block_size =
-               sel->info.uses_block_size &&
-               sel->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0;
-       program->num_cs_user_data_dwords =
-               sel->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD];
-
-       unsigned char ir_sha1_cache_key[20];
-       si_get_ir_cache_key(sel, false, false, ir_sha1_cache_key);
-
-       /* Try to load the shader from the shader cache. */
-       simple_mtx_lock(&sscreen->shader_cache_mutex);
-
-       if (si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader)) {
-               simple_mtx_unlock(&sscreen->shader_cache_mutex);
-
-               si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
-               si_shader_dump(sscreen, shader, debug, stderr, true);
-
-               if (!si_shader_binary_upload(sscreen, shader, 0))
-                       program->shader.compilation_failed = true;
-       } else {
-               simple_mtx_unlock(&sscreen->shader_cache_mutex);
-
-               if (!si_create_shader_variant(sscreen, compiler, &program->shader, debug)) {
-                       program->shader.compilation_failed = true;
-                       return;
-               }
-
-               bool scratch_enabled = shader->config.scratch_bytes_per_wave > 0;
-               unsigned user_sgprs = SI_NUM_RESOURCE_SGPRS +
-                                     (sel->info.uses_grid_size ? 3 : 0) +
-                                     (program->reads_variable_block_size ? 3 : 0) +
-                                     program->num_cs_user_data_dwords;
-
-               shader->config.rsrc1 =
-                       S_00B848_VGPRS((shader->config.num_vgprs - 1) /
-                                      (sscreen->compute_wave_size == 32 ? 8 : 4)) |
-                       S_00B848_DX10_CLAMP(1) |
-                       S_00B848_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
-                       S_00B848_WGP_MODE(sscreen->info.chip_class >= GFX10) |
-                       S_00B848_FLOAT_MODE(shader->config.float_mode);
-
-               if (sscreen->info.chip_class < GFX10) {
-                       shader->config.rsrc1 |=
-                               S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8);
-               }
-
-               shader->config.rsrc2 =
-                       S_00B84C_USER_SGPR(user_sgprs) |
-                       S_00B84C_SCRATCH_EN(scratch_enabled) |
-                       S_00B84C_TGID_X_EN(sel->info.uses_block_id[0]) |
-                       S_00B84C_TGID_Y_EN(sel->info.uses_block_id[1]) |
-                       S_00B84C_TGID_Z_EN(sel->info.uses_block_id[2]) |
-                       S_00B84C_TG_SIZE_EN(sel->info.uses_subgroup_info) |
-                       S_00B84C_TIDIG_COMP_CNT(sel->info.uses_thread_id[2] ? 2 :
-                                               sel->info.uses_thread_id[1] ? 1 : 0) |
-                       S_00B84C_LDS_SIZE(shader->config.lds_size);
-
-               simple_mtx_lock(&sscreen->shader_cache_mutex);
-               si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key,
-                                             shader, true);
-               simple_mtx_unlock(&sscreen->shader_cache_mutex);
-       }
-
-       ralloc_free(sel->nir);
-       sel->nir = NULL;
+   struct si_compute *program = (struct si_compute *)job;
+   struct si_shader_selector *sel = &program->sel;
+   struct si_shader *shader = &program->shader;
+   struct ac_llvm_compiler *compiler;
+   struct pipe_debug_callback *debug = &sel->compiler_ctx_state.debug;
+   struct si_screen *sscreen = sel->screen;
+
+   assert(!debug->debug_message || debug->async);
+   assert(thread_index >= 0);
+   assert(thread_index < ARRAY_SIZE(sscreen->compiler));
+   compiler = &sscreen->compiler[thread_index];
+
+   if (!compiler->passes)
+      si_init_compiler(sscreen, compiler);
+
+   assert(program->ir_type == PIPE_SHADER_IR_NIR);
+   si_nir_scan_shader(sel->nir, &sel->info);
+
+   /* Store the declared LDS size into si_shader_info for the shader
+    * cache to include it.
+    */
+   sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE] = program->local_size;
+
+   si_get_active_slot_masks(&sel->info, &sel->active_const_and_shader_buffers,
+                            &sel->active_samplers_and_images);
+
+   program->shader.is_monolithic = true;
+   program->reads_variable_block_size =
+      sel->info.uses_block_size && sel->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0;
+   program->num_cs_user_data_dwords =
+      sel->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD];
+
+   unsigned char ir_sha1_cache_key[20];
+   si_get_ir_cache_key(sel, false, false, ir_sha1_cache_key);
+
+   /* Try to load the shader from the shader cache. */
+   simple_mtx_lock(&sscreen->shader_cache_mutex);
+
+   if (si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader)) {
+      simple_mtx_unlock(&sscreen->shader_cache_mutex);
+
+      si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
+      si_shader_dump(sscreen, shader, debug, stderr, true);
+
+      if (!si_shader_binary_upload(sscreen, shader, 0))
+         program->shader.compilation_failed = true;
+   } else {
+      simple_mtx_unlock(&sscreen->shader_cache_mutex);
+
+      if (!si_create_shader_variant(sscreen, compiler, &program->shader, debug)) {
+         program->shader.compilation_failed = true;
+         return;
+      }
+
+      bool scratch_enabled = shader->config.scratch_bytes_per_wave > 0;
+      unsigned user_sgprs = SI_NUM_RESOURCE_SGPRS + (sel->info.uses_grid_size ? 3 : 0) +
+                            (program->reads_variable_block_size ? 3 : 0) +
+                            program->num_cs_user_data_dwords;
+
+      shader->config.rsrc1 = S_00B848_VGPRS((shader->config.num_vgprs - 1) /
+                                            (sscreen->compute_wave_size == 32 ? 8 : 4)) |
+                             S_00B848_DX10_CLAMP(1) |
+                             S_00B848_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
+                             S_00B848_WGP_MODE(sscreen->info.chip_class >= GFX10) |
+                             S_00B848_FLOAT_MODE(shader->config.float_mode);
+
+      if (sscreen->info.chip_class < GFX10) {
+         shader->config.rsrc1 |= S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8);
+      }
+
+      shader->config.rsrc2 = S_00B84C_USER_SGPR(user_sgprs) | S_00B84C_SCRATCH_EN(scratch_enabled) |
+                             S_00B84C_TGID_X_EN(sel->info.uses_block_id[0]) |
+                             S_00B84C_TGID_Y_EN(sel->info.uses_block_id[1]) |
+                             S_00B84C_TGID_Z_EN(sel->info.uses_block_id[2]) |
+                             S_00B84C_TG_SIZE_EN(sel->info.uses_subgroup_info) |
+                             S_00B84C_TIDIG_COMP_CNT(sel->info.uses_thread_id[2]
+                                                        ? 2
+                                                        : sel->info.uses_thread_id[1] ? 1 : 0) |
+                             S_00B84C_LDS_SIZE(shader->config.lds_size);
+
+      simple_mtx_lock(&sscreen->shader_cache_mutex);
+      si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, shader, true);
+      simple_mtx_unlock(&sscreen->shader_cache_mutex);
+   }
+
+   ralloc_free(sel->nir);
+   sel->nir = NULL;
 }
 
-static void *si_create_compute_state(
-       struct pipe_context *ctx,
-       const struct pipe_compute_state *cso)
+static void *si_create_compute_state(struct pipe_context *ctx, const struct pipe_compute_state *cso)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_screen *sscreen = (struct si_screen *)ctx->screen;
-       struct si_compute *program = CALLOC_STRUCT(si_compute);
-       struct si_shader_selector *sel = &program->sel;
-
-       pipe_reference_init(&sel->base.reference, 1);
-       sel->type = PIPE_SHADER_COMPUTE;
-       sel->screen = sscreen;
-       program->shader.selector = &program->sel;
-       program->ir_type = cso->ir_type;
-       program->local_size = cso->req_local_mem;
-       program->private_size = cso->req_private_mem;
-       program->input_size = cso->req_input_mem;
-
-       if (cso->ir_type != PIPE_SHADER_IR_NATIVE) {
-               if (cso->ir_type == PIPE_SHADER_IR_TGSI) {
-                       program->ir_type = PIPE_SHADER_IR_NIR;
-                       sel->nir = tgsi_to_nir(cso->prog, ctx->screen);
-               } else {
-                       assert(cso->ir_type == PIPE_SHADER_IR_NIR);
-                       sel->nir = (struct nir_shader *) cso->prog;
-               }
-
-               sel->compiler_ctx_state.debug = sctx->debug;
-               sel->compiler_ctx_state.is_debug_context = sctx->is_debug;
-               p_atomic_inc(&sscreen->num_shaders_created);
-
-               si_schedule_initial_compile(sctx, PIPE_SHADER_COMPUTE,
-                                           &sel->ready,
-                                           &sel->compiler_ctx_state,
-                                           program, si_create_compute_state_async);
-       } else {
-               const struct pipe_binary_program_header *header;
-               header = cso->prog;
-
-               program->shader.binary.elf_size = header->num_bytes;
-               program->shader.binary.elf_buffer = malloc(header->num_bytes);
-               if (!program->shader.binary.elf_buffer) {
-                       FREE(program);
-                       return NULL;
-               }
-               memcpy((void *)program->shader.binary.elf_buffer, header->blob, header->num_bytes);
-
-               const amd_kernel_code_t *code_object =
-                       si_compute_get_code_object(program, 0);
-               code_object_to_config(code_object, &program->shader.config);
-
-               si_shader_dump(sctx->screen, &program->shader, &sctx->debug, stderr, true);
-               if (!si_shader_binary_upload(sctx->screen, &program->shader, 0)) {
-                       fprintf(stderr, "LLVM failed to upload shader\n");
-                       free((void *)program->shader.binary.elf_buffer);
-                       FREE(program);
-                       return NULL;
-               }
-       }
-
-       return program;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_screen *sscreen = (struct si_screen *)ctx->screen;
+   struct si_compute *program = CALLOC_STRUCT(si_compute);
+   struct si_shader_selector *sel = &program->sel;
+
+   pipe_reference_init(&sel->base.reference, 1);
+   sel->type = PIPE_SHADER_COMPUTE;
+   sel->screen = sscreen;
+   program->shader.selector = &program->sel;
+   program->ir_type = cso->ir_type;
+   program->local_size = cso->req_local_mem;
+   program->private_size = cso->req_private_mem;
+   program->input_size = cso->req_input_mem;
+
+   if (cso->ir_type != PIPE_SHADER_IR_NATIVE) {
+      if (cso->ir_type == PIPE_SHADER_IR_TGSI) {
+         program->ir_type = PIPE_SHADER_IR_NIR;
+         sel->nir = tgsi_to_nir(cso->prog, ctx->screen);
+      } else {
+         assert(cso->ir_type == PIPE_SHADER_IR_NIR);
+         sel->nir = (struct nir_shader *)cso->prog;
+      }
+
+      sel->compiler_ctx_state.debug = sctx->debug;
+      sel->compiler_ctx_state.is_debug_context = sctx->is_debug;
+      p_atomic_inc(&sscreen->num_shaders_created);
+
+      si_schedule_initial_compile(sctx, PIPE_SHADER_COMPUTE, &sel->ready, &sel->compiler_ctx_state,
+                                  program, si_create_compute_state_async);
+   } else {
+      const struct pipe_binary_program_header *header;
+      header = cso->prog;
+
+      program->shader.binary.elf_size = header->num_bytes;
+      program->shader.binary.elf_buffer = malloc(header->num_bytes);
+      if (!program->shader.binary.elf_buffer) {
+         FREE(program);
+         return NULL;
+      }
+      memcpy((void *)program->shader.binary.elf_buffer, header->blob, header->num_bytes);
+
+      const amd_kernel_code_t *code_object = si_compute_get_code_object(program, 0);
+      code_object_to_config(code_object, &program->shader.config);
+
+      si_shader_dump(sctx->screen, &program->shader, &sctx->debug, stderr, true);
+      if (!si_shader_binary_upload(sctx->screen, &program->shader, 0)) {
+         fprintf(stderr, "LLVM failed to upload shader\n");
+         free((void *)program->shader.binary.elf_buffer);
+         FREE(program);
+         return NULL;
+      }
+   }
+
+   return program;
 }
 
 static void si_bind_compute_state(struct pipe_context *ctx, void *state)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
-       struct si_compute *program = (struct si_compute*)state;
-       struct si_shader_selector *sel = &program->sel;
-
-       sctx->cs_shader_state.program = program;
-       if (!program)
-               return;
-
-       /* Wait because we need active slot usage masks. */
-       if (program->ir_type != PIPE_SHADER_IR_NATIVE)
-               util_queue_fence_wait(&sel->ready);
-
-       si_set_active_descriptors(sctx,
-                                 SI_DESCS_FIRST_COMPUTE +
-                                 SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
-                                 sel->active_const_and_shader_buffers);
-       si_set_active_descriptors(sctx,
-                                 SI_DESCS_FIRST_COMPUTE +
-                                 SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
-                                 sel->active_samplers_and_images);
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_compute *program = (struct si_compute *)state;
+   struct si_shader_selector *sel = &program->sel;
+
+   sctx->cs_shader_state.program = program;
+   if (!program)
+      return;
+
+   /* Wait because we need active slot usage masks. */
+   if (program->ir_type != PIPE_SHADER_IR_NATIVE)
+      util_queue_fence_wait(&sel->ready);
+
+   si_set_active_descriptors(sctx,
+                             SI_DESCS_FIRST_COMPUTE + SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
+                             sel->active_const_and_shader_buffers);
+   si_set_active_descriptors(sctx, SI_DESCS_FIRST_COMPUTE + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
+                             sel->active_samplers_and_images);
 }
 
-static void si_set_global_binding(
-       struct pipe_context *ctx, unsigned first, unsigned n,
-       struct pipe_resource **resources,
-       uint32_t **handles)
+static void si_set_global_binding(struct pipe_context *ctx, unsigned first, unsigned n,
+                                  struct pipe_resource **resources, uint32_t **handles)
 {
-       unsigned i;
-       struct si_context *sctx = (struct si_context*)ctx;
-       struct si_compute *program = sctx->cs_shader_state.program;
-
-       if (first + n > program->max_global_buffers) {
-               unsigned old_max = program->max_global_buffers;
-               program->max_global_buffers = first + n;
-               program->global_buffers =
-                       realloc(program->global_buffers,
-                               program->max_global_buffers *
-                               sizeof(program->global_buffers[0]));
-               if (!program->global_buffers) {
-                       fprintf(stderr, "radeonsi: failed to allocate compute global_buffers\n");
-                       return;
-               }
-
-               memset(&program->global_buffers[old_max], 0,
-                      (program->max_global_buffers - old_max) *
-                      sizeof(program->global_buffers[0]));
-       }
-
-       if (!resources) {
-               for (i = 0; i < n; i++) {
-                       pipe_resource_reference(&program->global_buffers[first + i], NULL);
-               }
-               return;
-       }
-
-       for (i = 0; i < n; i++) {
-               uint64_t va;
-               uint32_t offset;
-               pipe_resource_reference(&program->global_buffers[first + i], resources[i]);
-               va = si_resource(resources[i])->gpu_address;
-               offset = util_le32_to_cpu(*handles[i]);
-               va += offset;
-               va = util_cpu_to_le64(va);
-               memcpy(handles[i], &va, sizeof(va));
-       }
+   unsigned i;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_compute *program = sctx->cs_shader_state.program;
+
+   if (first + n > program->max_global_buffers) {
+      unsigned old_max = program->max_global_buffers;
+      program->max_global_buffers = first + n;
+      program->global_buffers = realloc(
+         program->global_buffers, program->max_global_buffers * sizeof(program->global_buffers[0]));
+      if (!program->global_buffers) {
+         fprintf(stderr, "radeonsi: failed to allocate compute global_buffers\n");
+         return;
+      }
+
+      memset(&program->global_buffers[old_max], 0,
+             (program->max_global_buffers - old_max) * sizeof(program->global_buffers[0]));
+   }
+
+   if (!resources) {
+      for (i = 0; i < n; i++) {
+         pipe_resource_reference(&program->global_buffers[first + i], NULL);
+      }
+      return;
+   }
+
+   for (i = 0; i < n; i++) {
+      uint64_t va;
+      uint32_t offset;
+      pipe_resource_reference(&program->global_buffers[first + i], resources[i]);
+      va = si_resource(resources[i])->gpu_address;
+      offset = util_le32_to_cpu(*handles[i]);
+      va += offset;
+      va = util_cpu_to_le64(va);
+      memcpy(handles[i], &va, sizeof(va));
+   }
 }
 
 void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs)
 {
-       uint64_t bc_va;
-
-       radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2);
-       /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1,
-        * renamed COMPUTE_DESTINATION_EN_SEn on gfx10. */
-       radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
-       radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
-
-       if (sctx->chip_class >= GFX7) {
-               /* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */
-               radeon_set_sh_reg_seq(cs,
-                                    R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2);
-               radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) |
-                               S_00B858_SH1_CU_EN(0xffff));
-               radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) |
-                               S_00B858_SH1_CU_EN(0xffff));
-       }
-
-       if (sctx->chip_class >= GFX10)
-               radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, 0);
-
-       /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID
-        * and is now per pipe, so it should be handled in the
-        * kernel if we want to use something other than the default value,
-        * which is now 0x22f.
-        */
-       if (sctx->chip_class <= GFX6) {
-               /* XXX: This should be:
-                * (number of compute units) * 4 * (waves per simd) - 1 */
-
-               radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID,
-                                 0x190 /* Default value */);
-       }
-
-       /* Set the pointer to border colors. */
-       bc_va = sctx->border_color_buffer->gpu_address;
-
-       if (sctx->chip_class >= GFX7) {
-               radeon_set_uconfig_reg_seq(cs, R_030E00_TA_CS_BC_BASE_ADDR, 2);
-               radeon_emit(cs, bc_va >> 8);  /* R_030E00_TA_CS_BC_BASE_ADDR */
-               radeon_emit(cs, S_030E04_ADDRESS(bc_va >> 40)); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */
-       } else {
-               if (sctx->screen->info.si_TA_CS_BC_BASE_ADDR_allowed) {
-                       radeon_set_config_reg(cs, R_00950C_TA_CS_BC_BASE_ADDR,
-                                             bc_va >> 8);
-               }
-       }
+   uint64_t bc_va;
+
+   radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2);
+   /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1,
+    * renamed COMPUTE_DESTINATION_EN_SEn on gfx10. */
+   radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
+   radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
+
+   if (sctx->chip_class >= GFX7) {
+      /* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */
+      radeon_set_sh_reg_seq(cs, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2);
+      radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
+      radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
+   }
+
+   if (sctx->chip_class >= GFX10)
+      radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, 0);
+
+   /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID
+    * and is now per pipe, so it should be handled in the
+    * kernel if we want to use something other than the default value,
+    * which is now 0x22f.
+    */
+   if (sctx->chip_class <= GFX6) {
+      /* XXX: This should be:
+       * (number of compute units) * 4 * (waves per simd) - 1 */
+
+      radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */);
+   }
+
+   /* Set the pointer to border colors. */
+   bc_va = sctx->border_color_buffer->gpu_address;
+
+   if (sctx->chip_class >= GFX7) {
+      radeon_set_uconfig_reg_seq(cs, R_030E00_TA_CS_BC_BASE_ADDR, 2);
+      radeon_emit(cs, bc_va >> 8);                    /* R_030E00_TA_CS_BC_BASE_ADDR */
+      radeon_emit(cs, S_030E04_ADDRESS(bc_va >> 40)); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */
+   } else {
+      if (sctx->screen->info.si_TA_CS_BC_BASE_ADDR_allowed) {
+         radeon_set_config_reg(cs, R_00950C_TA_CS_BC_BASE_ADDR, bc_va >> 8);
+      }
+   }
 }
 
-static bool si_setup_compute_scratch_buffer(struct si_context *sctx,
-                                            struct si_shader *shader,
+static bool si_setup_compute_scratch_buffer(struct si_context *sctx, struct si_shader *shader,
                                             struct ac_shader_config *config)
 {
-       uint64_t scratch_bo_size, scratch_needed;
-       scratch_bo_size = 0;
-       scratch_needed = config->scratch_bytes_per_wave * sctx->scratch_waves;
-       if (sctx->compute_scratch_buffer)
-               scratch_bo_size = sctx->compute_scratch_buffer->b.b.width0;
+   uint64_t scratch_bo_size, scratch_needed;
+   scratch_bo_size = 0;
+   scratch_needed = config->scratch_bytes_per_wave * sctx->scratch_waves;
+   if (sctx->compute_scratch_buffer)
+      scratch_bo_size = sctx->compute_scratch_buffer->b.b.width0;
 
-       if (scratch_bo_size < scratch_needed) {
-               si_resource_reference(&sctx->compute_scratch_buffer, NULL);
+   if (scratch_bo_size < scratch_needed) {
+      si_resource_reference(&sctx->compute_scratch_buffer, NULL);
 
-               sctx->compute_scratch_buffer =
-                       si_aligned_buffer_create(&sctx->screen->b,
-                                                SI_RESOURCE_FLAG_UNMAPPABLE,
-                                                PIPE_USAGE_DEFAULT,
-                                                scratch_needed,
-                                                sctx->screen->info.pte_fragment_size);
+      sctx->compute_scratch_buffer =
+         si_aligned_buffer_create(&sctx->screen->b, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT,
+                                  scratch_needed, sctx->screen->info.pte_fragment_size);
 
-               if (!sctx->compute_scratch_buffer)
-                       return false;
-       }
+      if (!sctx->compute_scratch_buffer)
+         return false;
+   }
 
-       if (sctx->compute_scratch_buffer != shader->scratch_bo && scratch_needed) {
-               uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
+   if (sctx->compute_scratch_buffer != shader->scratch_bo && scratch_needed) {
+      uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
 
-               if (!si_shader_binary_upload(sctx->screen, shader, scratch_va))
-                       return false;
+      if (!si_shader_binary_upload(sctx->screen, shader, scratch_va))
+         return false;
 
-               si_resource_reference(&shader->scratch_bo,
-                                       sctx->compute_scratch_buffer);
-       }
+      si_resource_reference(&shader->scratch_bo, sctx->compute_scratch_buffer);
+   }
 
-       return true;
+   return true;
 }
 
-static bool si_switch_compute_shader(struct si_context *sctx,
-                                     struct si_compute *program,
-                                    struct si_shader *shader,
-                                    const amd_kernel_code_t *code_object,
-                                    unsigned offset)
+static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute *program,
+                                     struct si_shader *shader, const amd_kernel_code_t *code_object,
+                                     unsigned offset)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       struct ac_shader_config inline_config = {0};
-       struct ac_shader_config *config;
-       uint64_t shader_va;
-
-       if (sctx->cs_shader_state.emitted_program == program &&
-           sctx->cs_shader_state.offset == offset)
-               return true;
-
-       if (program->ir_type != PIPE_SHADER_IR_NATIVE) {
-               config = &shader->config;
-       } else {
-               unsigned lds_blocks;
-
-               config = &inline_config;
-               code_object_to_config(code_object, config);
-
-               lds_blocks = config->lds_size;
-               /* XXX: We are over allocating LDS.  For GFX6, the shader reports
-               * LDS in blocks of 256 bytes, so if there are 4 bytes lds
-               * allocated in the shader and 4 bytes allocated by the state
-               * tracker, then we will set LDS_SIZE to 512 bytes rather than 256.
-               */
-               if (sctx->chip_class <= GFX6) {
-                       lds_blocks += align(program->local_size, 256) >> 8;
-               } else {
-                       lds_blocks += align(program->local_size, 512) >> 9;
-               }
-
-               /* TODO: use si_multiwave_lds_size_workaround */
-               assert(lds_blocks <= 0xFF);
-
-               config->rsrc2 &= C_00B84C_LDS_SIZE;
-               config->rsrc2 |=  S_00B84C_LDS_SIZE(lds_blocks);
-       }
-
-       if (!si_setup_compute_scratch_buffer(sctx, shader, config))
-               return false;
-
-       if (shader->scratch_bo) {
-               COMPUTE_DBG(sctx->screen, "Waves: %u; Scratch per wave: %u bytes; "
-                           "Total Scratch: %u bytes\n", sctx->scratch_waves,
-                           config->scratch_bytes_per_wave,
-                           config->scratch_bytes_per_wave *
-                           sctx->scratch_waves);
-
-               radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-                             shader->scratch_bo, RADEON_USAGE_READWRITE,
-                             RADEON_PRIO_SCRATCH_BUFFER);
-       }
-
-       /* Prefetch the compute shader to TC L2.
-        *
-        * We should also prefetch graphics shaders if a compute dispatch was
-        * the last command, and the compute shader if a draw call was the last
-        * command. However, that would add more complexity and we're likely
-        * to get a shader state change in that case anyway.
-        */
-       if (sctx->chip_class >= GFX7) {
-               cik_prefetch_TC_L2_async(sctx, &program->shader.bo->b.b,
-                                        0, program->shader.bo->b.b.width0);
-       }
-
-       shader_va = shader->bo->gpu_address + offset;
-       if (program->ir_type == PIPE_SHADER_IR_NATIVE) {
-               /* Shader code is placed after the amd_kernel_code_t
-                * struct. */
-               shader_va += sizeof(amd_kernel_code_t);
-       }
-
-       radeon_add_to_buffer_list(sctx, sctx->gfx_cs, shader->bo,
-                                 RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
-
-       radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
-       radeon_emit(cs, shader_va >> 8);
-       radeon_emit(cs, S_00B834_DATA(shader_va >> 40));
-
-       radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
-       radeon_emit(cs, config->rsrc1);
-       radeon_emit(cs, config->rsrc2);
-
-       COMPUTE_DBG(sctx->screen, "COMPUTE_PGM_RSRC1: 0x%08x "
-               "COMPUTE_PGM_RSRC2: 0x%08x\n", config->rsrc1, config->rsrc2);
-
-       sctx->max_seen_compute_scratch_bytes_per_wave =
-               MAX2(sctx->max_seen_compute_scratch_bytes_per_wave,
-                    config->scratch_bytes_per_wave);
-
-       radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE,
-                 S_00B860_WAVES(sctx->scratch_waves)
-                    | S_00B860_WAVESIZE(sctx->max_seen_compute_scratch_bytes_per_wave >> 10));
-
-       sctx->cs_shader_state.emitted_program = program;
-       sctx->cs_shader_state.offset = offset;
-       sctx->cs_shader_state.uses_scratch =
-               config->scratch_bytes_per_wave != 0;
-
-       return true;
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   struct ac_shader_config inline_config = {0};
+   struct ac_shader_config *config;
+   uint64_t shader_va;
+
+   if (sctx->cs_shader_state.emitted_program == program && sctx->cs_shader_state.offset == offset)
+      return true;
+
+   if (program->ir_type != PIPE_SHADER_IR_NATIVE) {
+      config = &shader->config;
+   } else {
+      unsigned lds_blocks;
+
+      config = &inline_config;
+      code_object_to_config(code_object, config);
+
+      lds_blocks = config->lds_size;
+      /* XXX: We are over allocating LDS.  For GFX6, the shader reports
+       * LDS in blocks of 256 bytes, so if there are 4 bytes lds
+       * allocated in the shader and 4 bytes allocated by the state
+       * tracker, then we will set LDS_SIZE to 512 bytes rather than 256.
+       */
+      if (sctx->chip_class <= GFX6) {
+         lds_blocks += align(program->local_size, 256) >> 8;
+      } else {
+         lds_blocks += align(program->local_size, 512) >> 9;
+      }
+
+      /* TODO: use si_multiwave_lds_size_workaround */
+      assert(lds_blocks <= 0xFF);
+
+      config->rsrc2 &= C_00B84C_LDS_SIZE;
+      config->rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks);
+   }
+
+   if (!si_setup_compute_scratch_buffer(sctx, shader, config))
+      return false;
+
+   if (shader->scratch_bo) {
+      COMPUTE_DBG(sctx->screen,
+                  "Waves: %u; Scratch per wave: %u bytes; "
+                  "Total Scratch: %u bytes\n",
+                  sctx->scratch_waves, config->scratch_bytes_per_wave,
+                  config->scratch_bytes_per_wave * sctx->scratch_waves);
+
+      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, shader->scratch_bo, RADEON_USAGE_READWRITE,
+                                RADEON_PRIO_SCRATCH_BUFFER);
+   }
+
+   /* Prefetch the compute shader to TC L2.
+    *
+    * We should also prefetch graphics shaders if a compute dispatch was
+    * the last command, and the compute shader if a draw call was the last
+    * command. However, that would add more complexity and we're likely
+    * to get a shader state change in that case anyway.
+    */
+   if (sctx->chip_class >= GFX7) {
+      cik_prefetch_TC_L2_async(sctx, &program->shader.bo->b.b, 0, program->shader.bo->b.b.width0);
+   }
+
+   shader_va = shader->bo->gpu_address + offset;
+   if (program->ir_type == PIPE_SHADER_IR_NATIVE) {
+      /* Shader code is placed after the amd_kernel_code_t
+       * struct. */
+      shader_va += sizeof(amd_kernel_code_t);
+   }
+
+   radeon_add_to_buffer_list(sctx, sctx->gfx_cs, shader->bo, RADEON_USAGE_READ,
+                             RADEON_PRIO_SHADER_BINARY);
+
+   radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
+   radeon_emit(cs, shader_va >> 8);
+   radeon_emit(cs, S_00B834_DATA(shader_va >> 40));
+
+   radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
+   radeon_emit(cs, config->rsrc1);
+   radeon_emit(cs, config->rsrc2);
+
+   COMPUTE_DBG(sctx->screen,
+               "COMPUTE_PGM_RSRC1: 0x%08x "
+               "COMPUTE_PGM_RSRC2: 0x%08x\n",
+               config->rsrc1, config->rsrc2);
+
+   sctx->max_seen_compute_scratch_bytes_per_wave =
+      MAX2(sctx->max_seen_compute_scratch_bytes_per_wave, config->scratch_bytes_per_wave);
+
+   radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE,
+                     S_00B860_WAVES(sctx->scratch_waves) |
+                        S_00B860_WAVESIZE(sctx->max_seen_compute_scratch_bytes_per_wave >> 10));
+
+   sctx->cs_shader_state.emitted_program = program;
+   sctx->cs_shader_state.offset = offset;
+   sctx->cs_shader_state.uses_scratch = config->scratch_bytes_per_wave != 0;
+
+   return true;
 }
 
 static void setup_scratch_rsrc_user_sgprs(struct si_context *sctx,
-                                         const amd_kernel_code_t *code_object,
-                                         unsigned user_sgpr)
+                                          const amd_kernel_code_t *code_object, unsigned user_sgpr)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
-
-       unsigned max_private_element_size = AMD_HSA_BITS_GET(
-                       code_object->code_properties,
-                       AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE);
-
-       uint32_t scratch_dword0 = scratch_va & 0xffffffff;
-       uint32_t scratch_dword1 =
-               S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
-               S_008F04_SWIZZLE_ENABLE(1);
-
-       /* Disable address clamping */
-       uint32_t scratch_dword2 = 0xffffffff;
-       uint32_t scratch_dword3 =
-               S_008F0C_INDEX_STRIDE(3) |
-               S_008F0C_ADD_TID_ENABLE(1);
-
-       if (sctx->chip_class >= GFX9) {
-               assert(max_private_element_size == 1); /* always 4 bytes on GFX9 */
-       } else {
-               scratch_dword3 |= S_008F0C_ELEMENT_SIZE(max_private_element_size);
-
-               if (sctx->chip_class < GFX8) {
-                       /* BUF_DATA_FORMAT is ignored, but it cannot be
-                        * BUF_DATA_FORMAT_INVALID. */
-                       scratch_dword3 |=
-                               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_8);
-               }
-       }
-
-       radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
-                                                       (user_sgpr * 4), 4);
-       radeon_emit(cs, scratch_dword0);
-       radeon_emit(cs, scratch_dword1);
-       radeon_emit(cs, scratch_dword2);
-       radeon_emit(cs, scratch_dword3);
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
+
+   unsigned max_private_element_size =
+      AMD_HSA_BITS_GET(code_object->code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE);
+
+   uint32_t scratch_dword0 = scratch_va & 0xffffffff;
+   uint32_t scratch_dword1 =
+      S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) | S_008F04_SWIZZLE_ENABLE(1);
+
+   /* Disable address clamping */
+   uint32_t scratch_dword2 = 0xffffffff;
+   uint32_t scratch_dword3 = S_008F0C_INDEX_STRIDE(3) | S_008F0C_ADD_TID_ENABLE(1);
+
+   if (sctx->chip_class >= GFX9) {
+      assert(max_private_element_size == 1); /* always 4 bytes on GFX9 */
+   } else {
+      scratch_dword3 |= S_008F0C_ELEMENT_SIZE(max_private_element_size);
+
+      if (sctx->chip_class < GFX8) {
+         /* BUF_DATA_FORMAT is ignored, but it cannot be
+          * BUF_DATA_FORMAT_INVALID. */
+         scratch_dword3 |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_8);
+      }
+   }
+
+   radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 4);
+   radeon_emit(cs, scratch_dword0);
+   radeon_emit(cs, scratch_dword1);
+   radeon_emit(cs, scratch_dword2);
+   radeon_emit(cs, scratch_dword3);
 }
 
-static void si_setup_user_sgprs_co_v2(struct si_context *sctx,
-                                      const amd_kernel_code_t *code_object,
-                                     const struct pipe_grid_info *info,
-                                     uint64_t kernel_args_va)
+static void si_setup_user_sgprs_co_v2(struct si_context *sctx, const amd_kernel_code_t *code_object,
+                                      const struct pipe_grid_info *info, uint64_t kernel_args_va)
 {
-       struct si_compute *program = sctx->cs_shader_state.program;
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-
-       static const enum amd_code_property_mask_t workgroup_count_masks [] = {
-               AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X,
-               AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y,
-               AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z
-       };
-
-       unsigned i, user_sgpr = 0;
-       if (AMD_HSA_BITS_GET(code_object->code_properties,
-                       AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) {
-               if (code_object->workitem_private_segment_byte_size > 0) {
-                       setup_scratch_rsrc_user_sgprs(sctx, code_object,
-                                                               user_sgpr);
-               }
-               user_sgpr += 4;
-       }
-
-       if (AMD_HSA_BITS_GET(code_object->code_properties,
-                       AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR)) {
-               struct dispatch_packet dispatch;
-               unsigned dispatch_offset;
-               struct si_resource *dispatch_buf = NULL;
-               uint64_t dispatch_va;
-
-               /* Upload dispatch ptr */
-               memset(&dispatch, 0, sizeof(dispatch));
-
-               dispatch.workgroup_size_x = util_cpu_to_le16(info->block[0]);
-               dispatch.workgroup_size_y = util_cpu_to_le16(info->block[1]);
-               dispatch.workgroup_size_z = util_cpu_to_le16(info->block[2]);
-
-               dispatch.grid_size_x = util_cpu_to_le32(info->grid[0] * info->block[0]);
-               dispatch.grid_size_y = util_cpu_to_le32(info->grid[1] * info->block[1]);
-               dispatch.grid_size_z = util_cpu_to_le32(info->grid[2] * info->block[2]);
-
-               dispatch.private_segment_size = util_cpu_to_le32(program->private_size);
-               dispatch.group_segment_size = util_cpu_to_le32(program->local_size);
-
-               dispatch.kernarg_address = util_cpu_to_le64(kernel_args_va);
-
-               u_upload_data(sctx->b.const_uploader, 0, sizeof(dispatch),
-                              256, &dispatch, &dispatch_offset,
-                              (struct pipe_resource**)&dispatch_buf);
-
-               if (!dispatch_buf) {
-                       fprintf(stderr, "Error: Failed to allocate dispatch "
-                                       "packet.");
-               }
-               radeon_add_to_buffer_list(sctx, sctx->gfx_cs, dispatch_buf,
-                                 RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
-
-               dispatch_va = dispatch_buf->gpu_address + dispatch_offset;
-
-               radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
-                                                       (user_sgpr * 4), 2);
-               radeon_emit(cs, dispatch_va);
-               radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) |
-                                S_008F04_STRIDE(0));
-
-               si_resource_reference(&dispatch_buf, NULL);
-               user_sgpr += 2;
-       }
-
-       if (AMD_HSA_BITS_GET(code_object->code_properties,
-                       AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) {
-               radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
-                                                       (user_sgpr * 4), 2);
-               radeon_emit(cs, kernel_args_va);
-               radeon_emit(cs, S_008F04_BASE_ADDRESS_HI (kernel_args_va >> 32) |
-                               S_008F04_STRIDE(0));
-               user_sgpr += 2;
-       }
-
-       for (i = 0; i < 3 && user_sgpr < 16; i++) {
-               if (code_object->code_properties & workgroup_count_masks[i]) {
-                       radeon_set_sh_reg_seq(cs,
-                               R_00B900_COMPUTE_USER_DATA_0 +
-                               (user_sgpr * 4), 1);
-                       radeon_emit(cs, info->grid[i]);
-                       user_sgpr += 1;
-               }
-       }
+   struct si_compute *program = sctx->cs_shader_state.program;
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+   static const enum amd_code_property_mask_t workgroup_count_masks[] = {
+      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X,
+      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y,
+      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z};
+
+   unsigned i, user_sgpr = 0;
+   if (AMD_HSA_BITS_GET(code_object->code_properties,
+                        AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) {
+      if (code_object->workitem_private_segment_byte_size > 0) {
+         setup_scratch_rsrc_user_sgprs(sctx, code_object, user_sgpr);
+      }
+      user_sgpr += 4;
+   }
+
+   if (AMD_HSA_BITS_GET(code_object->code_properties, AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR)) {
+      struct dispatch_packet dispatch;
+      unsigned dispatch_offset;
+      struct si_resource *dispatch_buf = NULL;
+      uint64_t dispatch_va;
+
+      /* Upload dispatch ptr */
+      memset(&dispatch, 0, sizeof(dispatch));
+
+      dispatch.workgroup_size_x = util_cpu_to_le16(info->block[0]);
+      dispatch.workgroup_size_y = util_cpu_to_le16(info->block[1]);
+      dispatch.workgroup_size_z = util_cpu_to_le16(info->block[2]);
+
+      dispatch.grid_size_x = util_cpu_to_le32(info->grid[0] * info->block[0]);
+      dispatch.grid_size_y = util_cpu_to_le32(info->grid[1] * info->block[1]);
+      dispatch.grid_size_z = util_cpu_to_le32(info->grid[2] * info->block[2]);
+
+      dispatch.private_segment_size = util_cpu_to_le32(program->private_size);
+      dispatch.group_segment_size = util_cpu_to_le32(program->local_size);
+
+      dispatch.kernarg_address = util_cpu_to_le64(kernel_args_va);
+
+      u_upload_data(sctx->b.const_uploader, 0, sizeof(dispatch), 256, &dispatch, &dispatch_offset,
+                    (struct pipe_resource **)&dispatch_buf);
+
+      if (!dispatch_buf) {
+         fprintf(stderr, "Error: Failed to allocate dispatch "
+                         "packet.");
+      }
+      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, dispatch_buf, RADEON_USAGE_READ,
+                                RADEON_PRIO_CONST_BUFFER);
+
+      dispatch_va = dispatch_buf->gpu_address + dispatch_offset;
+
+      radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2);
+      radeon_emit(cs, dispatch_va);
+      radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) | S_008F04_STRIDE(0));
+
+      si_resource_reference(&dispatch_buf, NULL);
+      user_sgpr += 2;
+   }
+
+   if (AMD_HSA_BITS_GET(code_object->code_properties,
+                        AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) {
+      radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2);
+      radeon_emit(cs, kernel_args_va);
+      radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(kernel_args_va >> 32) | S_008F04_STRIDE(0));
+      user_sgpr += 2;
+   }
+
+   for (i = 0; i < 3 && user_sgpr < 16; i++) {
+      if (code_object->code_properties & workgroup_count_masks[i]) {
+         radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 1);
+         radeon_emit(cs, info->grid[i]);
+         user_sgpr += 1;
+      }
+   }
 }
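
As an aside, the user-SGPR bookkeeping in si_setup_user_sgprs_co_v2 above can be sketched in isolation. The increments (4 for the scratch descriptor, 2 for the dispatch-packet pointer, 2 for the kernarg pointer, 1 per workgroup-count value, capped at 16 user SGPRs) are taken directly from the function; the hypothetical kernel below, which enables every optional group, is only for illustration.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: mirrors the user_sgpr accounting of
 * si_setup_user_sgprs_co_v2 for a hypothetical code object. */
int main(void)
{
   bool has_private_segment_buffer = true; /* scratch descriptor: 4 SGPRs */
   bool has_dispatch_ptr = true;           /* dispatch packet address: 2 SGPRs */
   bool has_kernarg_ptr = true;            /* kernel argument address: 2 SGPRs */
   int num_workgroup_count_sgprs = 3;      /* grid size x/y/z: 1 SGPR each */

   unsigned user_sgpr = 0;
   if (has_private_segment_buffer)
      user_sgpr += 4;
   if (has_dispatch_ptr)
      user_sgpr += 2;
   if (has_kernarg_ptr)
      user_sgpr += 2;
   for (int i = 0; i < num_workgroup_count_sgprs && user_sgpr < 16; i++)
      user_sgpr += 1;

   /* Each user SGPR is one 32-bit COMPUTE_USER_DATA register, which is why
    * the register offset in the function advances by user_sgpr * 4 bytes. */
   printf("user SGPRs consumed: %u\n", user_sgpr); /* 11 in this example */
   return 0;
}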
 
-static bool si_upload_compute_input(struct si_context *sctx,
-                                   const amd_kernel_code_t *code_object,
-                                   const struct pipe_grid_info *info)
+static bool si_upload_compute_input(struct si_context *sctx, const amd_kernel_code_t *code_object,
+                                    const struct pipe_grid_info *info)
 {
-       struct si_compute *program = sctx->cs_shader_state.program;
-       struct si_resource *input_buffer = NULL;
-       uint32_t kernel_args_offset = 0;
-       uint32_t *kernel_args;
-       void *kernel_args_ptr;
-       uint64_t kernel_args_va;
+   struct si_compute *program = sctx->cs_shader_state.program;
+   struct si_resource *input_buffer = NULL;
+   uint32_t kernel_args_offset = 0;
+   uint32_t *kernel_args;
+   void *kernel_args_ptr;
+   uint64_t kernel_args_va;
 
-       u_upload_alloc(sctx->b.const_uploader, 0, program->input_size,
-                      sctx->screen->info.tcc_cache_line_size,
-                      &kernel_args_offset,
-                      (struct pipe_resource**)&input_buffer, &kernel_args_ptr);
+   u_upload_alloc(sctx->b.const_uploader, 0, program->input_size,
+                  sctx->screen->info.tcc_cache_line_size, &kernel_args_offset,
+                  (struct pipe_resource **)&input_buffer, &kernel_args_ptr);
 
-       if (unlikely(!kernel_args_ptr))
-               return false;
+   if (unlikely(!kernel_args_ptr))
+      return false;
 
-       kernel_args = (uint32_t*)kernel_args_ptr;
-       kernel_args_va = input_buffer->gpu_address + kernel_args_offset;
+   kernel_args = (uint32_t *)kernel_args_ptr;
+   kernel_args_va = input_buffer->gpu_address + kernel_args_offset;
 
-       memcpy(kernel_args, info->input, program->input_size);
+   memcpy(kernel_args, info->input, program->input_size);
 
-       for (unsigned i = 0; i < program->input_size / 4; i++) {
-               COMPUTE_DBG(sctx->screen, "input %u : %u\n", i,
-                       kernel_args[i]);
-       }
+   for (unsigned i = 0; i < program->input_size / 4; i++) {
+      COMPUTE_DBG(sctx->screen, "input %u : %u\n", i, kernel_args[i]);
+   }
 
-       radeon_add_to_buffer_list(sctx, sctx->gfx_cs, input_buffer,
-                                 RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
+   radeon_add_to_buffer_list(sctx, sctx->gfx_cs, input_buffer, RADEON_USAGE_READ,
+                             RADEON_PRIO_CONST_BUFFER);
 
-       si_setup_user_sgprs_co_v2(sctx, code_object, info, kernel_args_va);
-       si_resource_reference(&input_buffer, NULL);
-       return true;
+   si_setup_user_sgprs_co_v2(sctx, code_object, info, kernel_args_va);
+   si_resource_reference(&input_buffer, NULL);
+   return true;
 }
 
-static void si_setup_nir_user_data(struct si_context *sctx,
-                                  const struct pipe_grid_info *info)
+static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_grid_info *info)
 {
-       struct si_compute *program = sctx->cs_shader_state.program;
-       struct si_shader_selector *sel = &program->sel;
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       unsigned grid_size_reg = R_00B900_COMPUTE_USER_DATA_0 +
-                                4 * SI_NUM_RESOURCE_SGPRS;
-       unsigned block_size_reg = grid_size_reg +
-                                 /* 12 bytes = 3 dwords. */
-                                 12 * sel->info.uses_grid_size;
-       unsigned cs_user_data_reg = block_size_reg +
-                                   12 * program->reads_variable_block_size;
-
-       if (info->indirect) {
-               if (sel->info.uses_grid_size) {
-                       for (unsigned i = 0; i < 3; ++i) {
-                               si_cp_copy_data(sctx, sctx->gfx_cs,
-                                               COPY_DATA_REG, NULL, (grid_size_reg >> 2) + i,
-                                               COPY_DATA_SRC_MEM, si_resource(info->indirect),
-                                               info->indirect_offset + 4 * i);
-                       }
-               }
-       } else {
-               if (sel->info.uses_grid_size) {
-                       radeon_set_sh_reg_seq(cs, grid_size_reg, 3);
-                       radeon_emit(cs, info->grid[0]);
-                       radeon_emit(cs, info->grid[1]);
-                       radeon_emit(cs, info->grid[2]);
-               }
-               if (program->reads_variable_block_size) {
-                       radeon_set_sh_reg_seq(cs, block_size_reg, 3);
-                       radeon_emit(cs, info->block[0]);
-                       radeon_emit(cs, info->block[1]);
-                       radeon_emit(cs, info->block[2]);
-               }
-       }
-
-       if (program->num_cs_user_data_dwords) {
-               radeon_set_sh_reg_seq(cs, cs_user_data_reg, program->num_cs_user_data_dwords);
-               radeon_emit_array(cs, sctx->cs_user_data, program->num_cs_user_data_dwords);
-       }
+   struct si_compute *program = sctx->cs_shader_state.program;
+   struct si_shader_selector *sel = &program->sel;
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   unsigned grid_size_reg = R_00B900_COMPUTE_USER_DATA_0 + 4 * SI_NUM_RESOURCE_SGPRS;
+   unsigned block_size_reg = grid_size_reg +
+                             /* 12 bytes = 3 dwords. */
+                             12 * sel->info.uses_grid_size;
+   unsigned cs_user_data_reg = block_size_reg + 12 * program->reads_variable_block_size;
+
+   if (info->indirect) {
+      if (sel->info.uses_grid_size) {
+         for (unsigned i = 0; i < 3; ++i) {
+            si_cp_copy_data(sctx, sctx->gfx_cs, COPY_DATA_REG, NULL, (grid_size_reg >> 2) + i,
+                            COPY_DATA_SRC_MEM, si_resource(info->indirect),
+                            info->indirect_offset + 4 * i);
+         }
+      }
+   } else {
+      if (sel->info.uses_grid_size) {
+         radeon_set_sh_reg_seq(cs, grid_size_reg, 3);
+         radeon_emit(cs, info->grid[0]);
+         radeon_emit(cs, info->grid[1]);
+         radeon_emit(cs, info->grid[2]);
+      }
+      if (program->reads_variable_block_size) {
+         radeon_set_sh_reg_seq(cs, block_size_reg, 3);
+         radeon_emit(cs, info->block[0]);
+         radeon_emit(cs, info->block[1]);
+         radeon_emit(cs, info->block[2]);
+      }
+   }
+
+   if (program->num_cs_user_data_dwords) {
+      radeon_set_sh_reg_seq(cs, cs_user_data_reg, program->num_cs_user_data_dwords);
+      radeon_emit_array(cs, sctx->cs_user_data, program->num_cs_user_data_dwords);
+   }
 }
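
For context on the register arithmetic in si_setup_nir_user_data, here is a minimal standalone sketch of how the three offsets stack. The 0x00B900 base follows from the R_00B900_COMPUTE_USER_DATA_0 register name; the resource-SGPR count used below is an assumed placeholder, not the driver's SI_NUM_RESOURCE_SGPRS value.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: same offset stacking as si_setup_nir_user_data. */
#define COMPUTE_USER_DATA_0        0x00B900
#define EXAMPLE_NUM_RESOURCE_SGPRS 4 /* assumption for this example */

int main(void)
{
   bool uses_grid_size = true;
   bool reads_variable_block_size = true;

   unsigned grid_size_reg = COMPUTE_USER_DATA_0 + 4 * EXAMPLE_NUM_RESOURCE_SGPRS;
   /* grid and block sizes are 3 dwords each, i.e. 12 bytes of register space */
   unsigned block_size_reg = grid_size_reg + 12 * uses_grid_size;
   unsigned cs_user_data_reg = block_size_reg + 12 * reads_variable_block_size;

   printf("grid=0x%X block=0x%X user_data=0x%X\n",
          grid_size_reg, block_size_reg, cs_user_data_reg);
   return 0;
}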
 
-static void si_emit_dispatch_packets(struct si_context *sctx,
-                                     const struct pipe_grid_info *info)
+static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_grid_info *info)
 {
-       struct si_screen *sscreen = sctx->screen;
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       bool render_cond_bit = sctx->render_cond && !sctx->render_cond_force_off;
-       unsigned threads_per_threadgroup =
-               info->block[0] * info->block[1] * info->block[2];
-       unsigned waves_per_threadgroup =
-               DIV_ROUND_UP(threads_per_threadgroup, sscreen->compute_wave_size);
-       unsigned threadgroups_per_cu = 1;
-
-       if (sctx->chip_class >= GFX10 && waves_per_threadgroup == 1)
-               threadgroups_per_cu = 2;
-
-       radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
-                         ac_get_compute_resource_limits(&sscreen->info,
-                                                        waves_per_threadgroup,
-                                                        sctx->cs_max_waves_per_sh,
-                                                        threadgroups_per_cu));
-
-       unsigned dispatch_initiator =
-               S_00B800_COMPUTE_SHADER_EN(1) |
-               S_00B800_FORCE_START_AT_000(1) |
-               /* If the KMD allows it (there is a KMD hw register for it),
-                * allow launching waves out-of-order. (same as Vulkan) */
-               S_00B800_ORDER_MODE(sctx->chip_class >= GFX7) |
-               S_00B800_CS_W32_EN(sscreen->compute_wave_size == 32);
-
-       const uint *last_block = info->last_block;
-       bool partial_block_en = last_block[0] || last_block[1] || last_block[2];
-
-       radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
-
-       if (partial_block_en) {
-               unsigned partial[3];
-
-               /* If no partial_block, these should be an entire block size, not 0. */
-               partial[0] = last_block[0] ? last_block[0] : info->block[0];
-               partial[1] = last_block[1] ? last_block[1] : info->block[1];
-               partial[2] = last_block[2] ? last_block[2] : info->block[2];
-
-               radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]) |
-                               S_00B81C_NUM_THREAD_PARTIAL(partial[0]));
-               radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]) |
-                               S_00B820_NUM_THREAD_PARTIAL(partial[1]));
-               radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]) |
-                               S_00B824_NUM_THREAD_PARTIAL(partial[2]));
-
-               dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
-       } else {
-               radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]));
-               radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]));
-               radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]));
-       }
-
-       if (info->indirect) {
-               uint64_t base_va = si_resource(info->indirect)->gpu_address;
-
-               radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-                                si_resource(info->indirect),
-                                RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
-
-               radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) |
-                               PKT3_SHADER_TYPE_S(1));
-               radeon_emit(cs, 1);
-               radeon_emit(cs, base_va);
-               radeon_emit(cs, base_va >> 32);
-
-               radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) |
-                               PKT3_SHADER_TYPE_S(1));
-               radeon_emit(cs, info->indirect_offset);
-               radeon_emit(cs, dispatch_initiator);
-       } else {
-               radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) |
-                               PKT3_SHADER_TYPE_S(1));
-               radeon_emit(cs, info->grid[0]);
-               radeon_emit(cs, info->grid[1]);
-               radeon_emit(cs, info->grid[2]);
-               radeon_emit(cs, dispatch_initiator);
-       }
+   struct si_screen *sscreen = sctx->screen;
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   bool render_cond_bit = sctx->render_cond && !sctx->render_cond_force_off;
+   unsigned threads_per_threadgroup = info->block[0] * info->block[1] * info->block[2];
+   unsigned waves_per_threadgroup =
+      DIV_ROUND_UP(threads_per_threadgroup, sscreen->compute_wave_size);
+   unsigned threadgroups_per_cu = 1;
+
+   if (sctx->chip_class >= GFX10 && waves_per_threadgroup == 1)
+      threadgroups_per_cu = 2;
+
+   radeon_set_sh_reg(
+      cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
+      ac_get_compute_resource_limits(&sscreen->info, waves_per_threadgroup,
+                                     sctx->cs_max_waves_per_sh, threadgroups_per_cu));
+
+   unsigned dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_FORCE_START_AT_000(1) |
+                                 /* If the KMD allows it (there is a KMD hw register for it),
+                                  * allow launching waves out-of-order. (same as Vulkan) */
+                                 S_00B800_ORDER_MODE(sctx->chip_class >= GFX7) |
+                                 S_00B800_CS_W32_EN(sscreen->compute_wave_size == 32);
+
+   const uint *last_block = info->last_block;
+   bool partial_block_en = last_block[0] || last_block[1] || last_block[2];
+
+   radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
+
+   if (partial_block_en) {
+      unsigned partial[3];
+
+      /* If no partial_block, these should be an entire block size, not 0. */
+      partial[0] = last_block[0] ? last_block[0] : info->block[0];
+      partial[1] = last_block[1] ? last_block[1] : info->block[1];
+      partial[2] = last_block[2] ? last_block[2] : info->block[2];
+
+      radeon_emit(
+         cs, S_00B81C_NUM_THREAD_FULL(info->block[0]) | S_00B81C_NUM_THREAD_PARTIAL(partial[0]));
+      radeon_emit(
+         cs, S_00B820_NUM_THREAD_FULL(info->block[1]) | S_00B820_NUM_THREAD_PARTIAL(partial[1]));
+      radeon_emit(
+         cs, S_00B824_NUM_THREAD_FULL(info->block[2]) | S_00B824_NUM_THREAD_PARTIAL(partial[2]));
+
+      dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
+   } else {
+      radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]));
+      radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]));
+      radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]));
+   }
+
+   if (info->indirect) {
+      uint64_t base_va = si_resource(info->indirect)->gpu_address;
+
+      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(info->indirect), RADEON_USAGE_READ,
+                                RADEON_PRIO_DRAW_INDIRECT);
+
+      radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1));
+      radeon_emit(cs, 1);
+      radeon_emit(cs, base_va);
+      radeon_emit(cs, base_va >> 32);
+
+      radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) | PKT3_SHADER_TYPE_S(1));
+      radeon_emit(cs, info->indirect_offset);
+      radeon_emit(cs, dispatch_initiator);
+   } else {
+      radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) | PKT3_SHADER_TYPE_S(1));
+      radeon_emit(cs, info->grid[0]);
+      radeon_emit(cs, info->grid[1]);
+      radeon_emit(cs, info->grid[2]);
+      radeon_emit(cs, dispatch_initiator);
+   }
 }
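
A small worked example of the threadgroup math in si_emit_dispatch_packets, under assumed block, last-block and wave sizes (the numbers are illustrative, not taken from any particular dispatch):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Illustrative only: assumed dispatch geometry. */
int main(void)
{
   unsigned block[3] = {64, 1, 1};      /* threads per full threadgroup */
   unsigned last_block[3] = {23, 0, 0}; /* partial threadgroup at the grid edge */
   unsigned wave_size = 64;             /* 32 when CS_W32_EN is set */

   unsigned threads = block[0] * block[1] * block[2];
   unsigned waves_per_threadgroup = DIV_ROUND_UP(threads, wave_size);

   /* PARTIAL_TG_EN is only needed when some dimension has a partial block;
    * the "partial" value falls back to the full block size, never 0. */
   int partial_block_en = last_block[0] || last_block[1] || last_block[2];
   unsigned partial_x = last_block[0] ? last_block[0] : block[0];

   printf("waves/tg=%u partial_en=%d partial_x=%u\n",
          waves_per_threadgroup, partial_block_en, partial_x);
   return 0;
}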
 
-
-static void si_launch_grid(
-               struct pipe_context *ctx, const struct pipe_grid_info *info)
+static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *info)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
-       struct si_compute *program = sctx->cs_shader_state.program;
-       const amd_kernel_code_t *code_object =
-               si_compute_get_code_object(program, info->pc);
-       int i;
-       /* HW bug workaround when CS threadgroups > 256 threads and async
-        * compute isn't used, i.e. only one compute job can run at a time.
-        * If async compute is possible, the threadgroup size must be limited
-        * to 256 threads on all queues to avoid the bug.
-        * Only GFX6 and certain GFX7 chips are affected.
-        */
-       bool cs_regalloc_hang =
-               (sctx->chip_class == GFX6 ||
-                sctx->family == CHIP_BONAIRE ||
-                sctx->family == CHIP_KABINI) &&
-               info->block[0] * info->block[1] * info->block[2] > 256;
-
-       if (cs_regalloc_hang)
-               sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-                                SI_CONTEXT_CS_PARTIAL_FLUSH;
-
-       if (program->ir_type != PIPE_SHADER_IR_NATIVE &&
-           program->shader.compilation_failed)
-               return;
-
-       if (sctx->has_graphics) {
-               if (sctx->last_num_draw_calls != sctx->num_draw_calls) {
-                       si_update_fb_dirtiness_after_rendering(sctx);
-                       sctx->last_num_draw_calls = sctx->num_draw_calls;
-               }
-
-               si_decompress_textures(sctx, 1 << PIPE_SHADER_COMPUTE);
-       }
-
-       /* Add buffer sizes for memory checking in need_cs_space. */
-       si_context_add_resource_size(sctx, &program->shader.bo->b.b);
-       /* TODO: add the scratch buffer */
-
-       if (info->indirect) {
-               si_context_add_resource_size(sctx, info->indirect);
-
-               /* Indirect buffers use TC L2 on GFX9, but not older hw. */
-               if (sctx->chip_class <= GFX8 &&
-                   si_resource(info->indirect)->TC_L2_dirty) {
-                       sctx->flags |= SI_CONTEXT_WB_L2;
-                       si_resource(info->indirect)->TC_L2_dirty = false;
-               }
-       }
-
-       si_need_gfx_cs_space(sctx);
-
-       if (sctx->bo_list_add_all_compute_resources)
-               si_compute_resources_add_all_to_bo_list(sctx);
-
-       if (!sctx->cs_shader_state.initialized) {
-               si_emit_initial_compute_regs(sctx, sctx->gfx_cs);
-
-               sctx->cs_shader_state.emitted_program = NULL;
-               sctx->cs_shader_state.initialized = true;
-       }
-
-       if (sctx->flags)
-               sctx->emit_cache_flush(sctx);
-
-       if (!si_switch_compute_shader(sctx, program, &program->shader,
-                                       code_object, info->pc))
-               return;
-
-       si_upload_compute_shader_descriptors(sctx);
-       si_emit_compute_shader_pointers(sctx);
-
-       if (sctx->has_graphics &&
-           si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) {
-               sctx->atoms.s.render_cond.emit(sctx);
-               si_set_atom_dirty(sctx, &sctx->atoms.s.render_cond, false);
-       }
-
-       if (program->ir_type == PIPE_SHADER_IR_NATIVE &&
-           unlikely(!si_upload_compute_input(sctx, code_object, info)))
-               return;
-
-       /* Global buffers */
-       for (i = 0; i < program->max_global_buffers; i++) {
-               struct si_resource *buffer =
-                       si_resource(program->global_buffers[i]);
-               if (!buffer) {
-                       continue;
-               }
-               radeon_add_to_buffer_list(sctx, sctx->gfx_cs, buffer,
-                                         RADEON_USAGE_READWRITE,
-                                         RADEON_PRIO_COMPUTE_GLOBAL);
-       }
-
-       if (program->ir_type != PIPE_SHADER_IR_NATIVE)
-               si_setup_nir_user_data(sctx, info);
-
-       si_emit_dispatch_packets(sctx, info);
-
-       if (unlikely(sctx->current_saved_cs)) {
-               si_trace_emit(sctx);
-               si_log_compute_state(sctx, sctx->log);
-       }
-
-       sctx->compute_is_busy = true;
-       sctx->num_compute_calls++;
-       if (sctx->cs_shader_state.uses_scratch)
-               sctx->num_spill_compute_calls++;
-
-       if (cs_regalloc_hang)
-               sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_compute *program = sctx->cs_shader_state.program;
+   const amd_kernel_code_t *code_object = si_compute_get_code_object(program, info->pc);
+   int i;
+   /* HW bug workaround when CS threadgroups > 256 threads and async
+    * compute isn't used, i.e. only one compute job can run at a time.
+    * If async compute is possible, the threadgroup size must be limited
+    * to 256 threads on all queues to avoid the bug.
+    * Only GFX6 and certain GFX7 chips are affected.
+    */
+   bool cs_regalloc_hang =
+      (sctx->chip_class == GFX6 || sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KABINI) &&
+      info->block[0] * info->block[1] * info->block[2] > 256;
+
+   if (cs_regalloc_hang)
+      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
+
+   if (program->ir_type != PIPE_SHADER_IR_NATIVE && program->shader.compilation_failed)
+      return;
+
+   if (sctx->has_graphics) {
+      if (sctx->last_num_draw_calls != sctx->num_draw_calls) {
+         si_update_fb_dirtiness_after_rendering(sctx);
+         sctx->last_num_draw_calls = sctx->num_draw_calls;
+      }
+
+      si_decompress_textures(sctx, 1 << PIPE_SHADER_COMPUTE);
+   }
+
+   /* Add buffer sizes for memory checking in need_cs_space. */
+   si_context_add_resource_size(sctx, &program->shader.bo->b.b);
+   /* TODO: add the scratch buffer */
+
+   if (info->indirect) {
+      si_context_add_resource_size(sctx, info->indirect);
+
+      /* Indirect buffers use TC L2 on GFX9, but not older hw. */
+      if (sctx->chip_class <= GFX8 && si_resource(info->indirect)->TC_L2_dirty) {
+         sctx->flags |= SI_CONTEXT_WB_L2;
+         si_resource(info->indirect)->TC_L2_dirty = false;
+      }
+   }
+
+   si_need_gfx_cs_space(sctx);
+
+   if (sctx->bo_list_add_all_compute_resources)
+      si_compute_resources_add_all_to_bo_list(sctx);
+
+   if (!sctx->cs_shader_state.initialized) {
+      si_emit_initial_compute_regs(sctx, sctx->gfx_cs);
+
+      sctx->cs_shader_state.emitted_program = NULL;
+      sctx->cs_shader_state.initialized = true;
+   }
+
+   if (sctx->flags)
+      sctx->emit_cache_flush(sctx);
+
+   if (!si_switch_compute_shader(sctx, program, &program->shader, code_object, info->pc))
+      return;
+
+   si_upload_compute_shader_descriptors(sctx);
+   si_emit_compute_shader_pointers(sctx);
+
+   if (sctx->has_graphics && si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) {
+      sctx->atoms.s.render_cond.emit(sctx);
+      si_set_atom_dirty(sctx, &sctx->atoms.s.render_cond, false);
+   }
+
+   if (program->ir_type == PIPE_SHADER_IR_NATIVE &&
+       unlikely(!si_upload_compute_input(sctx, code_object, info)))
+      return;
+
+   /* Global buffers */
+   for (i = 0; i < program->max_global_buffers; i++) {
+      struct si_resource *buffer = si_resource(program->global_buffers[i]);
+      if (!buffer) {
+         continue;
+      }
+      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, buffer, RADEON_USAGE_READWRITE,
+                                RADEON_PRIO_COMPUTE_GLOBAL);
+   }
+
+   if (program->ir_type != PIPE_SHADER_IR_NATIVE)
+      si_setup_nir_user_data(sctx, info);
+
+   si_emit_dispatch_packets(sctx, info);
+
+   if (unlikely(sctx->current_saved_cs)) {
+      si_trace_emit(sctx);
+      si_log_compute_state(sctx, sctx->log);
+   }
+
+   sctx->compute_is_busy = true;
+   sctx->num_compute_calls++;
+   if (sctx->cs_shader_state.uses_scratch)
+      sctx->num_spill_compute_calls++;
+
+   if (cs_regalloc_hang)
+      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
 }
 
 void si_destroy_compute(struct si_compute *program)
 {
-       struct si_shader_selector *sel = &program->sel;
+   struct si_shader_selector *sel = &program->sel;
 
-       if (program->ir_type != PIPE_SHADER_IR_NATIVE) {
-               util_queue_drop_job(&sel->screen->shader_compiler_queue,
-                                   &sel->ready);
-               util_queue_fence_destroy(&sel->ready);
-       }
+   if (program->ir_type != PIPE_SHADER_IR_NATIVE) {
+      util_queue_drop_job(&sel->screen->shader_compiler_queue, &sel->ready);
+      util_queue_fence_destroy(&sel->ready);
+   }
 
-       for (unsigned i = 0; i < program->max_global_buffers; i++)
-               pipe_resource_reference(&program->global_buffers[i], NULL);
-       FREE(program->global_buffers);
+   for (unsigned i = 0; i < program->max_global_buffers; i++)
+      pipe_resource_reference(&program->global_buffers[i], NULL);
+   FREE(program->global_buffers);
 
-       si_shader_destroy(&program->shader);
-       ralloc_free(program->sel.nir);
-       FREE(program);
+   si_shader_destroy(&program->shader);
+   ralloc_free(program->sel.nir);
+   FREE(program);
 }
 
-static void si_delete_compute_state(struct pipe_context *ctx, void* state){
-       struct si_compute *program = (struct si_compute *)state;
-       struct si_context *sctx = (struct si_context*)ctx;
+static void si_delete_compute_state(struct pipe_context *ctx, void *state)
+{
+   struct si_compute *program = (struct si_compute *)state;
+   struct si_context *sctx = (struct si_context *)ctx;
 
-       if (!state)
-               return;
+   if (!state)
+      return;
 
-       if (program == sctx->cs_shader_state.program)
-               sctx->cs_shader_state.program = NULL;
+   if (program == sctx->cs_shader_state.program)
+      sctx->cs_shader_state.program = NULL;
 
-       if (program == sctx->cs_shader_state.emitted_program)
-               sctx->cs_shader_state.emitted_program = NULL;
+   if (program == sctx->cs_shader_state.emitted_program)
+      sctx->cs_shader_state.emitted_program = NULL;
 
-       si_compute_reference(&program, NULL);
+   si_compute_reference(&program, NULL);
 }
 
-static void si_set_compute_resources(struct pipe_context * ctx_,
-               unsigned start, unsigned count,
-               struct pipe_surface ** surfaces) { }
+static void si_set_compute_resources(struct pipe_context *ctx_, unsigned start, unsigned count,
+                                     struct pipe_surface **surfaces)
+{
+}
 
 void si_init_compute_functions(struct si_context *sctx)
 {
-       sctx->b.create_compute_state = si_create_compute_state;
-       sctx->b.delete_compute_state = si_delete_compute_state;
-       sctx->b.bind_compute_state = si_bind_compute_state;
-       sctx->b.set_compute_resources = si_set_compute_resources;
-       sctx->b.set_global_binding = si_set_global_binding;
-       sctx->b.launch_grid = si_launch_grid;
+   sctx->b.create_compute_state = si_create_compute_state;
+   sctx->b.delete_compute_state = si_delete_compute_state;
+   sctx->b.bind_compute_state = si_bind_compute_state;
+   sctx->b.set_compute_resources = si_set_compute_resources;
+   sctx->b.set_global_binding = si_set_global_binding;
+   sctx->b.launch_grid = si_launch_grid;
 }
index 14c3c8cb7896e8ca084152ed71b4712389647b7c..7cf0627185327bec8ef862b35f36db6b2d004cff 100644 (file)
 #ifndef SI_COMPUTE_H
 #define SI_COMPUTE_H
 
-#include "util/u_inlines.h"
-
 #include "si_shader.h"
+#include "util/u_inlines.h"
 
 struct si_compute {
-       struct si_shader_selector sel;
-       struct si_shader shader;
+   struct si_shader_selector sel;
+   struct si_shader shader;
 
-       unsigned ir_type;
-       unsigned local_size;
-       unsigned private_size;
-       unsigned input_size;
+   unsigned ir_type;
+   unsigned local_size;
+   unsigned private_size;
+   unsigned input_size;
 
-       int max_global_buffers;
-       struct pipe_resource **global_buffers;
+   int max_global_buffers;
+   struct pipe_resource **global_buffers;
 
-       bool reads_variable_block_size;
-       unsigned num_cs_user_data_dwords;
+   bool reads_variable_block_size;
+   unsigned num_cs_user_data_dwords;
 };
 
 void si_destroy_compute(struct si_compute *program);
 
-static inline void
-si_compute_reference(struct si_compute **dst, struct si_compute *src)
+static inline void si_compute_reference(struct si_compute **dst, struct si_compute *src)
 {
-       if (pipe_reference(&(*dst)->sel.base.reference, &src->sel.base.reference))
-               si_destroy_compute(*dst);
+   if (pipe_reference(&(*dst)->sel.base.reference, &src->sel.base.reference))
+      si_destroy_compute(*dst);
 
-       *dst = src;
+   *dst = src;
 }
 
 #endif /* SI_COMPUTE_H */
index de020bfaf8c2e6a4656e9edc3dbc6017670ec507..6e3b07cb7c8edb484a27d25abff0a897f2da8528 100644 (file)
 /* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
  * and L2_STREAM for src.
  */
-static enum si_cache_policy get_cache_policy(struct si_context *sctx,
-                                            enum si_coherency coher,
-                                            uint64_t size)
+static enum si_cache_policy get_cache_policy(struct si_context *sctx, enum si_coherency coher,
+                                             uint64_t size)
 {
-       if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META ||
-                                         coher == SI_COHERENCY_CP)) ||
-           (sctx->chip_class >= GFX7 && coher == SI_COHERENCY_SHADER))
-               return size <= 256 * 1024 ? L2_LRU : L2_STREAM;
+   if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META || coher == SI_COHERENCY_CP)) ||
+       (sctx->chip_class >= GFX7 && coher == SI_COHERENCY_SHADER))
+      return size <= 256 * 1024 ? L2_LRU : L2_STREAM;
 
-       return L2_BYPASS;
+   return L2_BYPASS;
 }
 
 unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
-                           enum si_cache_policy cache_policy)
+                            enum si_cache_policy cache_policy)
 {
-       switch (coher) {
-       default:
-       case SI_COHERENCY_NONE:
-       case SI_COHERENCY_CP:
-               return 0;
-       case SI_COHERENCY_SHADER:
-               return SI_CONTEXT_INV_SCACHE |
-                      SI_CONTEXT_INV_VCACHE |
-                      (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_L2 : 0);
-       case SI_COHERENCY_CB_META:
-               return SI_CONTEXT_FLUSH_AND_INV_CB;
-       }
+   switch (coher) {
+   default:
+   case SI_COHERENCY_NONE:
+   case SI_COHERENCY_CP:
+      return 0;
+   case SI_COHERENCY_SHADER:
+      return SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
+             (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_L2 : 0);
+   case SI_COHERENCY_CB_META:
+      return SI_CONTEXT_FLUSH_AND_INV_CB;
+   }
 }
 
-static void si_launch_grid_internal(struct si_context *sctx,
-                                   struct pipe_grid_info *info)
+static void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *info)
 {
-       /* Set settings for driver-internal compute dispatches. */
-       sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
-       sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
-       sctx->render_cond_force_off = true;
-       /* Skip decompression to prevent infinite recursion. */
-       sctx->blitter->running = true;
-
-       /* Dispatch compute. */
-       sctx->b.launch_grid(&sctx->b, info);
-
-       /* Restore default settings. */
-       sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
-       sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
-       sctx->render_cond_force_off = false;
-       sctx->blitter->running = false;
+   /* Set settings for driver-internal compute dispatches. */
+   sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
+   sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
+   sctx->render_cond_force_off = true;
+   /* Skip decompression to prevent infinite recursion. */
+   sctx->blitter->running = true;
+
+   /* Dispatch compute. */
+   sctx->b.launch_grid(&sctx->b, info);
+
+   /* Restore default settings. */
+   sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
+   sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
+   sctx->render_cond_force_off = false;
+   sctx->blitter->running = false;
 }
 
-static void si_compute_clear_12bytes_buffer(struct si_context *sctx,
-                                       struct pipe_resource *dst,
-                                       unsigned dst_offset,
-                                       unsigned size,
-                                       const uint32_t *clear_value,
-                                       enum si_coherency coher)
+static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe_resource *dst,
+                                            unsigned dst_offset, unsigned size,
+                                            const uint32_t *clear_value, enum si_coherency coher)
 {
-       struct pipe_context *ctx = &sctx->b;
+   struct pipe_context *ctx = &sctx->b;
 
-       assert(dst_offset % 4 == 0);
-       assert(size % 4 == 0);
-       unsigned size_12 = DIV_ROUND_UP(size, 12);
+   assert(dst_offset % 4 == 0);
+   assert(size % 4 == 0);
+   unsigned size_12 = DIV_ROUND_UP(size, 12);
 
-       unsigned data[4] = {0};
-       memcpy(data, clear_value, 12);
+   unsigned data[4] = {0};
+   memcpy(data, clear_value, 12);
 
-       sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-                      SI_CONTEXT_CS_PARTIAL_FLUSH |
-                      si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
+                  si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
 
-       struct pipe_shader_buffer saved_sb = {0};
-       si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb);
+   struct pipe_shader_buffer saved_sb = {0};
+   si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb);
 
-       unsigned saved_writable_mask = 0;
-       if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
-           (1u << si_get_shaderbuf_slot(0)))
-               saved_writable_mask = 1;
+   unsigned saved_writable_mask = 0;
+   if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
+       (1u << si_get_shaderbuf_slot(0)))
+      saved_writable_mask = 1;
 
-       struct pipe_constant_buffer saved_cb = {};
-       si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+   struct pipe_constant_buffer saved_cb = {};
+   si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
 
-       void *saved_cs = sctx->cs_shader_state.program;
+   void *saved_cs = sctx->cs_shader_state.program;
 
-       struct pipe_constant_buffer cb = {};
-       cb.buffer_size = sizeof(data);
-       cb.user_buffer = data;
-       ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
+   struct pipe_constant_buffer cb = {};
+   cb.buffer_size = sizeof(data);
+   cb.user_buffer = data;
+   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
 
-       struct pipe_shader_buffer sb = {0};
-       sb.buffer = dst;
-       sb.buffer_offset = dst_offset;
-       sb.buffer_size = size;
+   struct pipe_shader_buffer sb = {0};
+   sb.buffer = dst;
+   sb.buffer_offset = dst_offset;
+   sb.buffer_size = size;
 
-       ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1);
+   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1);
 
-       struct pipe_grid_info info = {0};
+   struct pipe_grid_info info = {0};
 
-       if (!sctx->cs_clear_12bytes_buffer)
-               sctx->cs_clear_12bytes_buffer =
-                       si_clear_12bytes_buffer_shader(ctx);
-       ctx->bind_compute_state(ctx, sctx->cs_clear_12bytes_buffer);
-       info.block[0] = 64;
-       info.last_block[0] = size_12 % 64;
-       info.block[1] = 1;
-       info.block[2] = 1;
-       info.grid[0] = DIV_ROUND_UP(size_12, 64);
-       info.grid[1] = 1;
-       info.grid[2] = 1;
+   if (!sctx->cs_clear_12bytes_buffer)
+      sctx->cs_clear_12bytes_buffer = si_clear_12bytes_buffer_shader(ctx);
+   ctx->bind_compute_state(ctx, sctx->cs_clear_12bytes_buffer);
+   info.block[0] = 64;
+   info.last_block[0] = size_12 % 64;
+   info.block[1] = 1;
+   info.block[2] = 1;
+   info.grid[0] = DIV_ROUND_UP(size_12, 64);
+   info.grid[1] = 1;
+   info.grid[2] = 1;
 
-       si_launch_grid_internal(sctx, &info);
+   si_launch_grid_internal(sctx, &info);
 
-       ctx->bind_compute_state(ctx, saved_cs);
-       ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask);
-       ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+   ctx->bind_compute_state(ctx, saved_cs);
+   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask);
+   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
 
-       pipe_resource_reference(&saved_sb.buffer, NULL);
-       pipe_resource_reference(&saved_cb.buffer, NULL);
+   pipe_resource_reference(&saved_sb.buffer, NULL);
+   pipe_resource_reference(&saved_cb.buffer, NULL);
 }
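
To make the launch geometry of si_compute_clear_12bytes_buffer concrete, a minimal sketch with an assumed clear size; the 64-thread block and the size_12 % 64 last block are exactly what the function programs, only the byte count is made up.

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Illustrative only: assumed clear size of 3000 bytes (250 12-byte items). */
int main(void)
{
   unsigned size = 3000;                      /* must be a multiple of 4 */
   unsigned size_12 = DIV_ROUND_UP(size, 12); /* one thread per 12-byte item */

   unsigned block_x = 64;
   unsigned last_block_x = size_12 % 64;      /* partial last threadgroup */
   unsigned grid_x = DIV_ROUND_UP(size_12, 64);

   printf("items=%u grid_x=%u last_block_x=%u\n", size_12, grid_x, last_block_x);
   return 0;
}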
 
-static void si_compute_do_clear_or_copy(struct si_context *sctx,
-                                       struct pipe_resource *dst,
-                                       unsigned dst_offset,
-                                       struct pipe_resource *src,
-                                       unsigned src_offset,
-                                       unsigned size,
-                                       const uint32_t *clear_value,
-                                       unsigned clear_value_size,
-                                       enum si_coherency coher)
+static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_resource *dst,
+                                        unsigned dst_offset, struct pipe_resource *src,
+                                        unsigned src_offset, unsigned size,
+                                        const uint32_t *clear_value, unsigned clear_value_size,
+                                        enum si_coherency coher)
 {
-       struct pipe_context *ctx = &sctx->b;
-
-       assert(src_offset % 4 == 0);
-       assert(dst_offset % 4 == 0);
-       assert(size % 4 == 0);
-
-       assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
-       assert(!src || src_offset + size <= src->width0);
-
-       sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-                      SI_CONTEXT_CS_PARTIAL_FLUSH |
-                      si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
-
-       /* Save states. */
-       void *saved_cs = sctx->cs_shader_state.program;
-       struct pipe_shader_buffer saved_sb[2] = {};
-       si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);
-
-       unsigned saved_writable_mask = 0;
-       for (unsigned i = 0; i < (src ? 2 : 1); i++) {
-               if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
-                   (1u << si_get_shaderbuf_slot(i)))
-                       saved_writable_mask |= 1 << i;
-       }
-
-       /* The memory accesses are coalesced, meaning that the 1st instruction writes
-        * the 1st contiguous block of data for the whole wave, the 2nd instruction
-        * writes the 2nd contiguous block of data, etc.
-        */
-       unsigned dwords_per_thread = src ? SI_COMPUTE_COPY_DW_PER_THREAD :
-                                          SI_COMPUTE_CLEAR_DW_PER_THREAD;
-       unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
-       unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
-       unsigned wave_size = sctx->screen->compute_wave_size;
-       unsigned dwords_per_wave = dwords_per_thread * wave_size;
-
-       unsigned num_dwords = size / 4;
-       unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
-
-       struct pipe_grid_info info = {};
-       info.block[0] = MIN2(wave_size, num_instructions);
-       info.block[1] = 1;
-       info.block[2] = 1;
-       info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
-       info.grid[1] = 1;
-       info.grid[2] = 1;
-
-       struct pipe_shader_buffer sb[2] = {};
-       sb[0].buffer = dst;
-       sb[0].buffer_offset = dst_offset;
-       sb[0].buffer_size = size;
-
-       bool shader_dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU;
-
-       if (src) {
-               sb[1].buffer = src;
-               sb[1].buffer_offset = src_offset;
-               sb[1].buffer_size = size;
-
-               ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb, 0x1);
-
-               if (!sctx->cs_copy_buffer) {
-                       sctx->cs_copy_buffer = si_create_dma_compute_shader(&sctx->b,
-                                                            SI_COMPUTE_COPY_DW_PER_THREAD,
-                                                            shader_dst_stream_policy, true);
-               }
-               ctx->bind_compute_state(ctx, sctx->cs_copy_buffer);
-       } else {
-               assert(clear_value_size >= 4 &&
-                      clear_value_size <= 16 &&
-                      util_is_power_of_two_or_zero(clear_value_size));
-
-               for (unsigned i = 0; i < 4; i++)
-                       sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];
-
-               ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb, 0x1);
-
-               if (!sctx->cs_clear_buffer) {
-                       sctx->cs_clear_buffer = si_create_dma_compute_shader(&sctx->b,
-                                                            SI_COMPUTE_CLEAR_DW_PER_THREAD,
-                                                            shader_dst_stream_policy, false);
-               }
-               ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
-       }
-
-       si_launch_grid_internal(sctx, &info);
-
-       enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
-       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-                      (cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0);
-
-       if (cache_policy != L2_BYPASS)
-               si_resource(dst)->TC_L2_dirty = true;
-
-       /* Restore states. */
-       ctx->bind_compute_state(ctx, saved_cs);
-       ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb,
-                               saved_writable_mask);
-       for (int i = 0; i < 2; i++)
-               pipe_resource_reference(&saved_sb[i].buffer, NULL);
+   struct pipe_context *ctx = &sctx->b;
+
+   assert(src_offset % 4 == 0);
+   assert(dst_offset % 4 == 0);
+   assert(size % 4 == 0);
+
+   assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
+   assert(!src || src_offset + size <= src->width0);
+
+   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
+                  si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+
+   /* Save states. */
+   void *saved_cs = sctx->cs_shader_state.program;
+   struct pipe_shader_buffer saved_sb[2] = {};
+   si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);
+
+   unsigned saved_writable_mask = 0;
+   for (unsigned i = 0; i < (src ? 2 : 1); i++) {
+      if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
+          (1u << si_get_shaderbuf_slot(i)))
+         saved_writable_mask |= 1 << i;
+   }
+
+   /* The memory accesses are coalesced, meaning that the 1st instruction writes
+    * the 1st contiguous block of data for the whole wave, the 2nd instruction
+    * writes the 2nd contiguous block of data, etc.
+    */
+   unsigned dwords_per_thread =
+      src ? SI_COMPUTE_COPY_DW_PER_THREAD : SI_COMPUTE_CLEAR_DW_PER_THREAD;
+   unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
+   unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
+   unsigned wave_size = sctx->screen->compute_wave_size;
+   unsigned dwords_per_wave = dwords_per_thread * wave_size;
+
+   unsigned num_dwords = size / 4;
+   unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
+
+   struct pipe_grid_info info = {};
+   info.block[0] = MIN2(wave_size, num_instructions);
+   info.block[1] = 1;
+   info.block[2] = 1;
+   info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
+   info.grid[1] = 1;
+   info.grid[2] = 1;
+
+   struct pipe_shader_buffer sb[2] = {};
+   sb[0].buffer = dst;
+   sb[0].buffer_offset = dst_offset;
+   sb[0].buffer_size = size;
+
+   bool shader_dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU;
+
+   if (src) {
+      sb[1].buffer = src;
+      sb[1].buffer_offset = src_offset;
+      sb[1].buffer_size = size;
+
+      ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb, 0x1);
+
+      if (!sctx->cs_copy_buffer) {
+         sctx->cs_copy_buffer = si_create_dma_compute_shader(
+            &sctx->b, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true);
+      }
+      ctx->bind_compute_state(ctx, sctx->cs_copy_buffer);
+   } else {
+      assert(clear_value_size >= 4 && clear_value_size <= 16 &&
+             util_is_power_of_two_or_zero(clear_value_size));
+
+      for (unsigned i = 0; i < 4; i++)
+         sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];
+
+      ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb, 0x1);
+
+      if (!sctx->cs_clear_buffer) {
+         sctx->cs_clear_buffer = si_create_dma_compute_shader(
+            &sctx->b, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false);
+      }
+      ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
+   }
+
+   si_launch_grid_internal(sctx, &info);
+
+   enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
+   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0);
+
+   if (cache_policy != L2_BYPASS)
+      si_resource(dst)->TC_L2_dirty = true;
+
+   /* Restore states. */
+   ctx->bind_compute_state(ctx, saved_cs);
+   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb, saved_writable_mask);
+   for (int i = 0; i < 2; i++)
+      pipe_resource_reference(&saved_sb[i].buffer, NULL);
 }
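
The coalescing comment above is easier to follow with numbers, so here is a minimal sketch of the same sizing math. The per-thread dword count and wave size are assumptions for the example, not the driver's SI_COMPUTE_*_DW_PER_THREAD or compute_wave_size values.

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define MAX2(a, b) ((a) > (b) ? (a) : (b))
#define MIN2(a, b) ((a) < (b) ? (a) : (b))

/* Illustrative only: assumed per-thread dword count and wave size. */
int main(void)
{
   unsigned size = 1 << 20;        /* 1 MiB clear, 4-byte aligned */
   unsigned dwords_per_thread = 4; /* assumption for the example */
   unsigned wave_size = 64;

   unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
   unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
   unsigned dwords_per_wave = dwords_per_thread * wave_size;

   unsigned num_dwords = size / 4;
   unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

   /* One threadgroup per wave: each wave covers dwords_per_wave contiguous
    * dwords, each instruction one coalesced block for the whole wave. */
   unsigned block_x = MIN2(wave_size, num_instructions);
   unsigned grid_x = DIV_ROUND_UP(num_dwords, dwords_per_wave);

   printf("block_x=%u grid_x=%u (%u dwords per wave)\n",
          block_x, grid_x, dwords_per_wave);
   return 0;
}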
 
-void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
-                    uint64_t offset, uint64_t size, uint32_t *clear_value,
-                    uint32_t clear_value_size, enum si_coherency coher,
-                    bool force_cpdma)
+void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset,
+                     uint64_t size, uint32_t *clear_value, uint32_t clear_value_size,
+                     enum si_coherency coher, bool force_cpdma)
 {
-       if (!size)
-               return;
-
-       ASSERTED unsigned clear_alignment = MIN2(clear_value_size, 4);
-
-       assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
-       assert(offset % clear_alignment == 0);
-       assert(size % clear_alignment == 0);
-       assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */
-
-       /* Reduce a large clear value size if possible. */
-       if (clear_value_size > 4) {
-               bool clear_dword_duplicated = true;
-
-               /* See if we can lower large fills to dword fills. */
-               for (unsigned i = 1; i < clear_value_size / 4; i++) {
-                       if (clear_value[0] != clear_value[i]) {
-                               clear_dword_duplicated = false;
-                               break;
-                       }
-               }
-               if (clear_dword_duplicated)
-                       clear_value_size = 4;
-       }
-
-       /* Expand a small clear value size. */
-       uint32_t tmp_clear_value;
-       if (clear_value_size <= 2) {
-               if (clear_value_size == 1) {
-                       tmp_clear_value = *(uint8_t*)clear_value;
-                       tmp_clear_value |= (tmp_clear_value << 8) |
-                                          (tmp_clear_value << 16) |
-                                          (tmp_clear_value << 24);
-               } else {
-                       tmp_clear_value = *(uint16_t*)clear_value;
-                       tmp_clear_value |= tmp_clear_value << 16;
-               }
-               clear_value = &tmp_clear_value;
-               clear_value_size = 4;
-       }
-
-       if (clear_value_size == 12) {
-               si_compute_clear_12bytes_buffer(sctx, dst, offset, size, clear_value, coher);
-               return;
-       }
-
-       uint64_t aligned_size = size & ~3ull;
-       if (aligned_size >= 4) {
-               /* Before GFX9, CP DMA was very slow when clearing GTT, so never
-                * use CP DMA clears on those chips, because we can't be certain
-                * about buffer placements.
-                */
-               if (clear_value_size > 4 ||
-                   (!force_cpdma &&
-                    clear_value_size == 4 &&
-                    offset % 4 == 0 &&
-                    (size > 32*1024 || sctx->chip_class <= GFX9))) {
-                       si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
-                                                   aligned_size, clear_value,
-                                                   clear_value_size, coher);
-               } else {
-                       assert(clear_value_size == 4);
-                       si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset,
-                                              aligned_size, *clear_value, 0, coher,
-                                              get_cache_policy(sctx, coher, size));
-               }
-
-               offset += aligned_size;
-               size -= aligned_size;
-       }
-
-       /* Handle non-dword alignment. */
-       if (size) {
-               assert(dst);
-               assert(dst->target == PIPE_BUFFER);
-               assert(size < 4);
-
-               pipe_buffer_write(&sctx->b, dst, offset, size, clear_value);
-       }
+   if (!size)
+      return;
+
+   ASSERTED unsigned clear_alignment = MIN2(clear_value_size, 4);
+
+   assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
+   assert(offset % clear_alignment == 0);
+   assert(size % clear_alignment == 0);
+   assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */
+
+   /* Reduce a large clear value size if possible. */
+   if (clear_value_size > 4) {
+      bool clear_dword_duplicated = true;
+
+      /* See if we can lower large fills to dword fills. */
+      for (unsigned i = 1; i < clear_value_size / 4; i++) {
+         if (clear_value[0] != clear_value[i]) {
+            clear_dword_duplicated = false;
+            break;
+         }
+      }
+      if (clear_dword_duplicated)
+         clear_value_size = 4;
+   }
+
+   /* Expand a small clear value size. */
+   uint32_t tmp_clear_value;
+   if (clear_value_size <= 2) {
+      if (clear_value_size == 1) {
+         tmp_clear_value = *(uint8_t *)clear_value;
+         tmp_clear_value |=
+            (tmp_clear_value << 8) | (tmp_clear_value << 16) | (tmp_clear_value << 24);
+      } else {
+         tmp_clear_value = *(uint16_t *)clear_value;
+         tmp_clear_value |= tmp_clear_value << 16;
+      }
+      clear_value = &tmp_clear_value;
+      clear_value_size = 4;
+   }
+
+   if (clear_value_size == 12) {
+      si_compute_clear_12bytes_buffer(sctx, dst, offset, size, clear_value, coher);
+      return;
+   }
+
+   uint64_t aligned_size = size & ~3ull;
+   if (aligned_size >= 4) {
+      /* Before GFX9, CP DMA was very slow when clearing GTT, so never
+       * use CP DMA clears on those chips, because we can't be certain
+       * about buffer placements.
+       */
+      if (clear_value_size > 4 || (!force_cpdma && clear_value_size == 4 && offset % 4 == 0 &&
+                                   (size > 32 * 1024 || sctx->chip_class <= GFX9))) {
+         si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0, aligned_size, clear_value,
+                                     clear_value_size, coher);
+      } else {
+         assert(clear_value_size == 4);
+         si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset, aligned_size, *clear_value, 0,
+                                coher, get_cache_policy(sctx, coher, size));
+      }
+
+      offset += aligned_size;
+      size -= aligned_size;
+   }
+
+   /* Handle non-dword alignment. */
+   if (size) {
+      assert(dst);
+      assert(dst->target == PIPE_BUFFER);
+      assert(size < 4);
+
+      pipe_buffer_write(&sctx->b, dst, offset, size, clear_value);
+   }
 }
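
The clear-value normalization at the top of si_clear_buffer (replicating 8- and 16-bit values up to a full dword before dispatching) can be sketched on its own; the input values below are of course made up.

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: replicate a 1- or 2-byte clear value into a dword,
 * the same way si_clear_buffer expands small clear values. */
static uint32_t expand_clear_value(const void *value, unsigned size)
{
   if (size == 1) {
      uint32_t v = *(const uint8_t *)value;
      return v | (v << 8) | (v << 16) | (v << 24);
   }
   if (size == 2) {
      uint32_t v = *(const uint16_t *)value;
      return v | (v << 16);
   }
   return *(const uint32_t *)value; /* already at least a dword */
}

int main(void)
{
   uint8_t byte_val = 0xAB;
   uint16_t short_val = 0x1234;

   printf("0x%02X -> 0x%08X\n", byte_val, expand_clear_value(&byte_val, 1));
   printf("0x%04X -> 0x%08X\n", short_val, expand_clear_value(&short_val, 2));
   return 0;
}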
 
-static void si_pipe_clear_buffer(struct pipe_context *ctx,
-                                struct pipe_resource *dst,
-                                unsigned offset, unsigned size,
-                                const void *clear_value,
-                                int clear_value_size)
+static void si_pipe_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
+                                 unsigned offset, unsigned size, const void *clear_value,
+                                 int clear_value_size)
 {
-       si_clear_buffer((struct si_context*)ctx, dst, offset, size, (uint32_t*)clear_value,
-                       clear_value_size, SI_COHERENCY_SHADER, false);
+   si_clear_buffer((struct si_context *)ctx, dst, offset, size, (uint32_t *)clear_value,
+                   clear_value_size, SI_COHERENCY_SHADER, false);
 }
 
-void si_copy_buffer(struct si_context *sctx,
-                   struct pipe_resource *dst, struct pipe_resource *src,
-                   uint64_t dst_offset, uint64_t src_offset, unsigned size)
+void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
+                    uint64_t dst_offset, uint64_t src_offset, unsigned size)
 {
-       if (!size)
-               return;
-
-       enum si_coherency coher = SI_COHERENCY_SHADER;
-       enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
-
-       /* Only use compute for VRAM copies on dGPUs. */
-       if (sctx->screen->info.has_dedicated_vram &&
-           si_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
-           si_resource(src)->domains & RADEON_DOMAIN_VRAM &&
-           size > 32 * 1024 &&
-           dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) {
-               si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset,
-                                           size, NULL, 0, coher);
-       } else {
-               si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
-                                     0, coher, cache_policy);
-       }
+   if (!size)
+      return;
+
+   enum si_coherency coher = SI_COHERENCY_SHADER;
+   enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
+
+   /* Only use compute for VRAM copies on dGPUs. */
+   if (sctx->screen->info.has_dedicated_vram && si_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
+       si_resource(src)->domains & RADEON_DOMAIN_VRAM && size > 32 * 1024 && dst_offset % 4 == 0 &&
+       src_offset % 4 == 0 && size % 4 == 0) {
+      si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset, size, NULL, 0, coher);
+   } else {
+      si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, 0, coher, cache_policy);
+   }
 }
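
The path selection in si_copy_buffer reduces to one predicate: use the compute copy only for large, dword-aligned VRAM-to-VRAM copies on boards with dedicated VRAM, and fall back to CP DMA otherwise. A hedged restatement with hypothetical parameter names, not a driver entry point:

#include <stdbool.h>
#include <stdint.h>

/* Sketch of the heuristic above. */
static bool should_use_compute_copy(bool has_dedicated_vram, bool dst_in_vram, bool src_in_vram,
                                    uint64_t dst_offset, uint64_t src_offset, unsigned size)
{
   return has_dedicated_vram && dst_in_vram && src_in_vram &&
          size > 32 * 1024 &&
          dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0;
}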
 
-void si_compute_copy_image(struct si_context *sctx,
-                          struct pipe_resource *dst,
-                          unsigned dst_level,
-                          struct pipe_resource *src,
-                          unsigned src_level,
-                          unsigned dstx, unsigned dsty, unsigned dstz,
-                          const struct pipe_box *src_box)
+void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level,
+                           struct pipe_resource *src, unsigned src_level, unsigned dstx,
+                           unsigned dsty, unsigned dstz, const struct pipe_box *src_box)
 {
-       struct pipe_context *ctx = &sctx->b;
-       unsigned width = src_box->width;
-       unsigned height = src_box->height;
-       unsigned depth = src_box->depth;
-       enum pipe_format src_format = util_format_linear(src->format);
-       enum pipe_format dst_format = util_format_linear(dst->format);
-
-       assert(util_format_is_subsampled_422(src_format) ==
-              util_format_is_subsampled_422(dst_format));
-
-       if (util_format_is_subsampled_422(src_format)) {
-               src_format = dst_format = PIPE_FORMAT_R32_UINT;
-               /* Interpreting 422 subsampled format (16 bpp) as 32 bpp
-                * should force us to divide src_box->x, dstx and width by 2.
-                * But given that ac_surface allocates this format as 32 bpp
-                * and that surf_size is then modified to pack the values
-                * we must keep the original values to get the correct results.
-                */
-       }
-       unsigned data[] = {src_box->x, src_box->y, src_box->z, 0,
-                          dstx, dsty, dstz, 0};
-
-       if (width == 0 || height == 0)
-               return;
-
-       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-                      si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
-
-       /* The driver doesn't decompress resources automatically here. */
-       si_decompress_subresource(ctx, dst, PIPE_MASK_RGBAZS, dst_level,
-                                 dstz, dstz + src_box->depth - 1);
-       si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level,
-                                 src_box->z, src_box->z + src_box->depth - 1);
-
-       /* src and dst have the same number of samples. */
-       si_make_CB_shader_coherent(sctx, src->nr_samples, true,
-                                  /* Only src can have DCC.*/
-                                  ((struct si_texture*)src)->surface.u.gfx9.dcc.pipe_aligned);
-
-       struct pipe_constant_buffer saved_cb = {};
-       si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
-
-       struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
-       struct pipe_image_view saved_image[2] = {0};
-       util_copy_image_view(&saved_image[0], &images->views[0]);
-       util_copy_image_view(&saved_image[1], &images->views[1]);
-
-       void *saved_cs = sctx->cs_shader_state.program;
-
-       struct pipe_constant_buffer cb = {};
-       cb.buffer_size = sizeof(data);
-       cb.user_buffer = data;
-       ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
-
-       struct pipe_image_view image[2] = {0};
-       image[0].resource = src;
-       image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
-       image[0].format = src_format;
-       image[0].u.tex.level = src_level;
-       image[0].u.tex.first_layer = 0;
-       image[0].u.tex.last_layer =
-               src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1
-                                               : (unsigned)(src->array_size - 1);
-       image[1].resource = dst;
-       image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
-       image[1].format = dst_format;
-       image[1].u.tex.level = dst_level;
-       image[1].u.tex.first_layer = 0;
-       image[1].u.tex.last_layer =
-               dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1
-                                               : (unsigned)(dst->array_size - 1);
-
-       if (src->format == PIPE_FORMAT_R9G9B9E5_FLOAT)
-               image[0].format = image[1].format = PIPE_FORMAT_R32_UINT;
-
-       /* SNORM8 blitting has precision issues on some chips. Use the SINT
-        * equivalent instead, which doesn't force DCC decompression.
-        * Note that some chips avoid this issue by using SDMA.
-        */
-       if (util_format_is_snorm8(dst->format)) {
-               image[0].format = image[1].format =
-                       util_format_snorm8_to_sint8(dst->format);
-       }
-
-       ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image);
-
-       struct pipe_grid_info info = {0};
-
-       if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) {
-               if (!sctx->cs_copy_image_1d_array)
-                       sctx->cs_copy_image_1d_array =
-                               si_create_copy_image_compute_shader_1d_array(ctx);
-               ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array);
-               info.block[0] = 64;
-               info.last_block[0] = width % 64;
-               info.block[1] = 1;
-               info.block[2] = 1;
-               info.grid[0] = DIV_ROUND_UP(width, 64);
-               info.grid[1] = depth;
-               info.grid[2] = 1;
-       } else {
-               if (!sctx->cs_copy_image)
-                       sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx);
-               ctx->bind_compute_state(ctx, sctx->cs_copy_image);
-               info.block[0] = 8;
-               info.last_block[0] = width % 8;
-               info.block[1] = 8;
-               info.last_block[1] = height % 8;
-               info.block[2] = 1;
-               info.grid[0] = DIV_ROUND_UP(width, 8);
-               info.grid[1] = DIV_ROUND_UP(height, 8);
-               info.grid[2] = depth;
-       }
-
-       si_launch_grid_internal(sctx, &info);
-
-       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-                      (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
-                      si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
-       ctx->bind_compute_state(ctx, saved_cs);
-       ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
-       ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
-       for (int i = 0; i < 2; i++)
-               pipe_resource_reference(&saved_image[i].resource, NULL);
-       pipe_resource_reference(&saved_cb.buffer, NULL);
+   struct pipe_context *ctx = &sctx->b;
+   unsigned width = src_box->width;
+   unsigned height = src_box->height;
+   unsigned depth = src_box->depth;
+   enum pipe_format src_format = util_format_linear(src->format);
+   enum pipe_format dst_format = util_format_linear(dst->format);
+
+   assert(util_format_is_subsampled_422(src_format) == util_format_is_subsampled_422(dst_format));
+
+   if (util_format_is_subsampled_422(src_format)) {
+      src_format = dst_format = PIPE_FORMAT_R32_UINT;
+      /* Interpreting 422 subsampled format (16 bpp) as 32 bpp
+       * should force us to divide src_box->x, dstx and width by 2.
+       * But given that ac_surface allocates this format as 32 bpp
+       * and that surf_size is then modified to pack the values
+       * we must keep the original values to get the correct results.
+       */
+   }
+   unsigned data[] = {src_box->x, src_box->y, src_box->z, 0, dstx, dsty, dstz, 0};
+
+   if (width == 0 || height == 0)
+      return;
+
+   sctx->flags |=
+      SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
+
+   /* The driver doesn't decompress resources automatically here. */
+   si_decompress_subresource(ctx, dst, PIPE_MASK_RGBAZS, dst_level, dstz,
+                             dstz + src_box->depth - 1);
+   si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level, src_box->z,
+                             src_box->z + src_box->depth - 1);
+
+   /* src and dst have the same number of samples. */
+   si_make_CB_shader_coherent(sctx, src->nr_samples, true,
+                              /* Only src can have DCC.*/
+                              ((struct si_texture *)src)->surface.u.gfx9.dcc.pipe_aligned);
+
+   struct pipe_constant_buffer saved_cb = {};
+   si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+
+   struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
+   struct pipe_image_view saved_image[2] = {0};
+   util_copy_image_view(&saved_image[0], &images->views[0]);
+   util_copy_image_view(&saved_image[1], &images->views[1]);
+
+   void *saved_cs = sctx->cs_shader_state.program;
+
+   struct pipe_constant_buffer cb = {};
+   cb.buffer_size = sizeof(data);
+   cb.user_buffer = data;
+   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
+
+   struct pipe_image_view image[2] = {0};
+   image[0].resource = src;
+   image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
+   image[0].format = src_format;
+   image[0].u.tex.level = src_level;
+   image[0].u.tex.first_layer = 0;
+   image[0].u.tex.last_layer = src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1
+                                                              : (unsigned)(src->array_size - 1);
+   image[1].resource = dst;
+   image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
+   image[1].format = dst_format;
+   image[1].u.tex.level = dst_level;
+   image[1].u.tex.first_layer = 0;
+   image[1].u.tex.last_layer = dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1
+                                                              : (unsigned)(dst->array_size - 1);
+
+   if (src->format == PIPE_FORMAT_R9G9B9E5_FLOAT)
+      image[0].format = image[1].format = PIPE_FORMAT_R32_UINT;
+
+   /* SNORM8 blitting has precision issues on some chips. Use the SINT
+    * equivalent instead, which doesn't force DCC decompression.
+    * Note that some chips avoid this issue by using SDMA.
+    */
+   if (util_format_is_snorm8(dst->format)) {
+      image[0].format = image[1].format = util_format_snorm8_to_sint8(dst->format);
+   }
+
+   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image);
+
+   struct pipe_grid_info info = {0};
+
+   if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) {
+      if (!sctx->cs_copy_image_1d_array)
+         sctx->cs_copy_image_1d_array = si_create_copy_image_compute_shader_1d_array(ctx);
+      ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array);
+      info.block[0] = 64;
+      info.last_block[0] = width % 64;
+      info.block[1] = 1;
+      info.block[2] = 1;
+      info.grid[0] = DIV_ROUND_UP(width, 64);
+      info.grid[1] = depth;
+      info.grid[2] = 1;
+   } else {
+      if (!sctx->cs_copy_image)
+         sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx);
+      ctx->bind_compute_state(ctx, sctx->cs_copy_image);
+      info.block[0] = 8;
+      info.last_block[0] = width % 8;
+      info.block[1] = 8;
+      info.last_block[1] = height % 8;
+      info.block[2] = 1;
+      info.grid[0] = DIV_ROUND_UP(width, 8);
+      info.grid[1] = DIV_ROUND_UP(height, 8);
+      info.grid[2] = depth;
+   }
+
+   si_launch_grid_internal(sctx, &info);
+
+   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
+                  si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
+   ctx->bind_compute_state(ctx, saved_cs);
+   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
+   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+   for (int i = 0; i < 2; i++)
+      pipe_resource_reference(&saved_image[i].resource, NULL);
+   pipe_resource_reference(&saved_cb.buffer, NULL);
 }
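
A pattern worth calling out from the grid setup above: the thread-group counts come from DIV_ROUND_UP and the trailing partial group size from the remainder. A minimal standalone sketch, assuming the 8x8 block used by the tiled copy path (the struct and function names are hypothetical):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

struct dispatch_2d {
   unsigned grid_x, grid_y;             /* number of thread groups */
   unsigned last_block_x, last_block_y; /* size of the partial trailing block, 0 = full */
};

/* Computes how many 8x8 thread groups cover a width x height region and how
 * large the trailing partial groups are. Sketch only; the driver stores the
 * same values in pipe_grid_info. */
static struct dispatch_2d size_dispatch_8x8(unsigned width, unsigned height)
{
   struct dispatch_2d d;
   d.grid_x = DIV_ROUND_UP(width, 8);
   d.grid_y = DIV_ROUND_UP(height, 8);
   d.last_block_x = width % 8;
   d.last_block_y = height % 8;
   return d;
}

int main(void)
{
   struct dispatch_2d d = size_dispatch_8x8(100, 37);
   /* Prints: grid 13x5, last block 4x5. */
   printf("grid %ux%u, last block %ux%u\n", d.grid_x, d.grid_y, d.last_block_x, d.last_block_y);
   return 0;
}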
 
 void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
 {
-       struct pipe_context *ctx = &sctx->b;
-
-       sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-                      SI_CONTEXT_CS_PARTIAL_FLUSH |
-                      si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU) |
-                      si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_LRU);
-       sctx->emit_cache_flush(sctx);
-
-       /* Save states. */
-       void *saved_cs = sctx->cs_shader_state.program;
-       struct pipe_image_view saved_img[3] = {};
-
-       for (unsigned i = 0; i < 3; i++) {
-               util_copy_image_view(&saved_img[i],
-                                    &sctx->images[PIPE_SHADER_COMPUTE].views[i]);
-       }
-
-       /* Set images. */
-       bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16;
-       unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements;
-       struct pipe_image_view img[3];
-
-       assert(tex->surface.dcc_retile_map_offset && tex->surface.dcc_retile_map_offset <= UINT_MAX);
-       assert(tex->surface.dcc_offset && tex->surface.dcc_offset <= UINT_MAX);
-       assert(tex->surface.display_dcc_offset && tex->surface.display_dcc_offset <= UINT_MAX);
-
-       for (unsigned i = 0; i < 3; i++) {
-               img[i].resource = &tex->buffer.b.b;
-               img[i].access = i == 2 ? PIPE_IMAGE_ACCESS_WRITE : PIPE_IMAGE_ACCESS_READ;
-               img[i].shader_access = SI_IMAGE_ACCESS_AS_BUFFER;
-       }
-
-       img[0].format = use_uint16 ? PIPE_FORMAT_R16G16B16A16_UINT :
-                                    PIPE_FORMAT_R32G32B32A32_UINT;
-       img[0].u.buf.offset = tex->surface.dcc_retile_map_offset;
-       img[0].u.buf.size = num_elements * (use_uint16 ? 2 : 4);
-
-       img[1].format = PIPE_FORMAT_R8_UINT;
-       img[1].u.buf.offset = tex->surface.dcc_offset;
-       img[1].u.buf.size = tex->surface.dcc_size;
-
-       img[2].format = PIPE_FORMAT_R8_UINT;
-       img[2].u.buf.offset = tex->surface.display_dcc_offset;
-       img[2].u.buf.size = tex->surface.u.gfx9.display_dcc_size;
-
-       ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, img);
-
-       /* Bind the compute shader. */
-       if (!sctx->cs_dcc_retile)
-               sctx->cs_dcc_retile = si_create_dcc_retile_cs(ctx);
-       ctx->bind_compute_state(ctx, sctx->cs_dcc_retile);
-
-       /* Dispatch compute. */
-       /* img[0] has 4 channels per element containing 2 pairs of DCC offsets. */
-       unsigned num_threads = num_elements / 4;
-
-       struct pipe_grid_info info = {};
-       info.block[0] = 64;
-       info.block[1] = 1;
-       info.block[2] = 1;
-       info.grid[0] = DIV_ROUND_UP(num_threads, 64); /* includes the partial block */
-       info.grid[1] = 1;
-       info.grid[2] = 1;
-       info.last_block[0] = num_threads % 64;
-
-       si_launch_grid_internal(sctx, &info);
-
-       /* Don't flush caches or wait. The driver will wait at the end of this IB,
-        * and L2 will be flushed by the kernel fence.
-        */
-
-       /* Restore states. */
-       ctx->bind_compute_state(ctx, saved_cs);
-       ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, saved_img);
-
-       for (unsigned i = 0; i < 3; i++) {
-               pipe_resource_reference(&saved_img[i].resource, NULL);
-       }
+   struct pipe_context *ctx = &sctx->b;
+
+   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
+                  si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU) |
+                  si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_LRU);
+   sctx->emit_cache_flush(sctx);
+
+   /* Save states. */
+   void *saved_cs = sctx->cs_shader_state.program;
+   struct pipe_image_view saved_img[3] = {};
+
+   for (unsigned i = 0; i < 3; i++) {
+      util_copy_image_view(&saved_img[i], &sctx->images[PIPE_SHADER_COMPUTE].views[i]);
+   }
+
+   /* Set images. */
+   bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16;
+   unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements;
+   struct pipe_image_view img[3];
+
+   assert(tex->surface.dcc_retile_map_offset && tex->surface.dcc_retile_map_offset <= UINT_MAX);
+   assert(tex->surface.dcc_offset && tex->surface.dcc_offset <= UINT_MAX);
+   assert(tex->surface.display_dcc_offset && tex->surface.display_dcc_offset <= UINT_MAX);
+
+   for (unsigned i = 0; i < 3; i++) {
+      img[i].resource = &tex->buffer.b.b;
+      img[i].access = i == 2 ? PIPE_IMAGE_ACCESS_WRITE : PIPE_IMAGE_ACCESS_READ;
+      img[i].shader_access = SI_IMAGE_ACCESS_AS_BUFFER;
+   }
+
+   img[0].format = use_uint16 ? PIPE_FORMAT_R16G16B16A16_UINT : PIPE_FORMAT_R32G32B32A32_UINT;
+   img[0].u.buf.offset = tex->surface.dcc_retile_map_offset;
+   img[0].u.buf.size = num_elements * (use_uint16 ? 2 : 4);
+
+   img[1].format = PIPE_FORMAT_R8_UINT;
+   img[1].u.buf.offset = tex->surface.dcc_offset;
+   img[1].u.buf.size = tex->surface.dcc_size;
+
+   img[2].format = PIPE_FORMAT_R8_UINT;
+   img[2].u.buf.offset = tex->surface.display_dcc_offset;
+   img[2].u.buf.size = tex->surface.u.gfx9.display_dcc_size;
+
+   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, img);
+
+   /* Bind the compute shader. */
+   if (!sctx->cs_dcc_retile)
+      sctx->cs_dcc_retile = si_create_dcc_retile_cs(ctx);
+   ctx->bind_compute_state(ctx, sctx->cs_dcc_retile);
+
+   /* Dispatch compute. */
+   /* img[0] has 4 channels per element containing 2 pairs of DCC offsets. */
+   unsigned num_threads = num_elements / 4;
+
+   struct pipe_grid_info info = {};
+   info.block[0] = 64;
+   info.block[1] = 1;
+   info.block[2] = 1;
+   info.grid[0] = DIV_ROUND_UP(num_threads, 64); /* includes the partial block */
+   info.grid[1] = 1;
+   info.grid[2] = 1;
+   info.last_block[0] = num_threads % 64;
+
+   si_launch_grid_internal(sctx, &info);
+
+   /* Don't flush caches or wait. The driver will wait at the end of this IB,
+    * and L2 will be flushed by the kernel fence.
+    */
+
+   /* Restore states. */
+   ctx->bind_compute_state(ctx, saved_cs);
+   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, saved_img);
+
+   for (unsigned i = 0; i < 3; i++) {
+      pipe_resource_reference(&saved_img[i].resource, NULL);
+   }
 }
 
 /* Expand FMASK to make it identity, so that image stores can ignore it. */
 void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       bool is_array = tex->target == PIPE_TEXTURE_2D_ARRAY;
-       unsigned log_fragments = util_logbase2(tex->nr_storage_samples);
-       unsigned log_samples = util_logbase2(tex->nr_samples);
-       assert(tex->nr_samples >= 2);
-
-       /* EQAA FMASK expansion is unimplemented. */
-       if (tex->nr_samples != tex->nr_storage_samples)
-               return;
-
-       /* Flush caches and sync engines. */
-       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-                      si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
-       si_make_CB_shader_coherent(sctx, tex->nr_samples, true,
-                                  true /* DCC is not possible with image stores */);
-
-       /* Save states. */
-       void *saved_cs = sctx->cs_shader_state.program;
-       struct pipe_image_view saved_image = {0};
-       util_copy_image_view(&saved_image, &sctx->images[PIPE_SHADER_COMPUTE].views[0]);
-
-       /* Bind the image. */
-       struct pipe_image_view image = {0};
-       image.resource = tex;
-       /* Don't set WRITE so as not to trigger FMASK expansion, causing
-        * an infinite loop. */
-       image.shader_access = image.access = PIPE_IMAGE_ACCESS_READ;
-       image.format = util_format_linear(tex->format);
-       if (is_array)
-               image.u.tex.last_layer = tex->array_size - 1;
-
-       ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image);
-
-       /* Bind the shader. */
-       void **shader = &sctx->cs_fmask_expand[log_samples - 1][is_array];
-       if (!*shader)
-               *shader = si_create_fmask_expand_cs(ctx, tex->nr_samples, is_array);
-       ctx->bind_compute_state(ctx, *shader);
-
-       /* Dispatch compute. */
-       struct pipe_grid_info info = {0};
-       info.block[0] = 8;
-       info.last_block[0] = tex->width0 % 8;
-       info.block[1] = 8;
-       info.last_block[1] = tex->height0 % 8;
-       info.block[2] = 1;
-       info.grid[0] = DIV_ROUND_UP(tex->width0, 8);
-       info.grid[1] = DIV_ROUND_UP(tex->height0, 8);
-       info.grid[2] = is_array ? tex->array_size : 1;
-
-       si_launch_grid_internal(sctx, &info);
-
-       /* Flush caches and sync engines. */
-       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-                      (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
-                      si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
-
-       /* Restore previous states. */
-       ctx->bind_compute_state(ctx, saved_cs);
-       ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
-       pipe_resource_reference(&saved_image.resource, NULL);
-
-       /* Array of fully expanded FMASK values, arranged by [log2(fragments)][log2(samples)-1]. */
+   struct si_context *sctx = (struct si_context *)ctx;
+   bool is_array = tex->target == PIPE_TEXTURE_2D_ARRAY;
+   unsigned log_fragments = util_logbase2(tex->nr_storage_samples);
+   unsigned log_samples = util_logbase2(tex->nr_samples);
+   assert(tex->nr_samples >= 2);
+
+   /* EQAA FMASK expansion is unimplemented. */
+   if (tex->nr_samples != tex->nr_storage_samples)
+      return;
+
+   /* Flush caches and sync engines. */
+   sctx->flags |=
+      SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
+   si_make_CB_shader_coherent(sctx, tex->nr_samples, true,
+                              true /* DCC is not possible with image stores */);
+
+   /* Save states. */
+   void *saved_cs = sctx->cs_shader_state.program;
+   struct pipe_image_view saved_image = {0};
+   util_copy_image_view(&saved_image, &sctx->images[PIPE_SHADER_COMPUTE].views[0]);
+
+   /* Bind the image. */
+   struct pipe_image_view image = {0};
+   image.resource = tex;
+   /* Don't set WRITE so as not to trigger FMASK expansion, causing
+    * an infinite loop. */
+   image.shader_access = image.access = PIPE_IMAGE_ACCESS_READ;
+   image.format = util_format_linear(tex->format);
+   if (is_array)
+      image.u.tex.last_layer = tex->array_size - 1;
+
+   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image);
+
+   /* Bind the shader. */
+   void **shader = &sctx->cs_fmask_expand[log_samples - 1][is_array];
+   if (!*shader)
+      *shader = si_create_fmask_expand_cs(ctx, tex->nr_samples, is_array);
+   ctx->bind_compute_state(ctx, *shader);
+
+   /* Dispatch compute. */
+   struct pipe_grid_info info = {0};
+   info.block[0] = 8;
+   info.last_block[0] = tex->width0 % 8;
+   info.block[1] = 8;
+   info.last_block[1] = tex->height0 % 8;
+   info.block[2] = 1;
+   info.grid[0] = DIV_ROUND_UP(tex->width0, 8);
+   info.grid[1] = DIV_ROUND_UP(tex->height0, 8);
+   info.grid[2] = is_array ? tex->array_size : 1;
+
+   si_launch_grid_internal(sctx, &info);
+
+   /* Flush caches and sync engines. */
+   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
+                  si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
+
+   /* Restore previous states. */
+   ctx->bind_compute_state(ctx, saved_cs);
+   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
+   pipe_resource_reference(&saved_image.resource, NULL);
+
+   /* Array of fully expanded FMASK values, arranged by [log2(fragments)][log2(samples)-1]. */
 #define INVALID 0 /* never used */
-       static const uint64_t fmask_expand_values[][4] = {
-               /* samples */
-               /* 2 (8 bpp) 4 (8 bpp)   8 (8-32bpp) 16 (16-64bpp)      fragments */
-               {0x02020202, 0x0E0E0E0E, 0xFEFEFEFE, 0xFFFEFFFE},         /* 1 */
-               {0x02020202, 0xA4A4A4A4, 0xAAA4AAA4, 0xAAAAAAA4},         /* 2 */
-               {INVALID,    0xE4E4E4E4, 0x44443210, 0x4444444444443210}, /* 4 */
-               {INVALID,    INVALID,    0x76543210, 0x8888888876543210}, /* 8 */
-       };
-
-       /* Clear FMASK to identity. */
-       struct si_texture *stex = (struct si_texture*)tex;
-       si_clear_buffer(sctx, tex, stex->surface.fmask_offset, stex->surface.fmask_size,
-                       (uint32_t*)&fmask_expand_values[log_fragments][log_samples - 1],
-                       4, SI_COHERENCY_SHADER, false);
+   static const uint64_t fmask_expand_values[][4] = {
+      /* samples */
+      /* 2 (8 bpp) 4 (8 bpp)   8 (8-32bpp) 16 (16-64bpp)      fragments */
+      {0x02020202, 0x0E0E0E0E, 0xFEFEFEFE, 0xFFFEFFFE},      /* 1 */
+      {0x02020202, 0xA4A4A4A4, 0xAAA4AAA4, 0xAAAAAAA4},      /* 2 */
+      {INVALID, 0xE4E4E4E4, 0x44443210, 0x4444444444443210}, /* 4 */
+      {INVALID, INVALID, 0x76543210, 0x8888888876543210},    /* 8 */
+   };
+
+   /* Clear FMASK to identity. */
+   struct si_texture *stex = (struct si_texture *)tex;
+   si_clear_buffer(sctx, tex, stex->surface.fmask_offset, stex->surface.fmask_size,
+                   (uint32_t *)&fmask_expand_values[log_fragments][log_samples - 1], 4,
+                   SI_COHERENCY_SHADER, false);
 }
 
 void si_init_compute_blit_functions(struct si_context *sctx)
 {
-       sctx->b.clear_buffer = si_pipe_clear_buffer;
+   sctx->b.clear_buffer = si_pipe_clear_buffer;
 }
 
 /* Clear a region of a color surface to a constant value. */
-void si_compute_clear_render_target(struct pipe_context *ctx,
-                                   struct pipe_surface *dstsurf,
-                                   const union pipe_color_union *color,
-                                   unsigned dstx, unsigned dsty,
-                                   unsigned width, unsigned height,
-                                   bool render_condition_enabled)
+void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dstsurf,
+                                    const union pipe_color_union *color, unsigned dstx,
+                                    unsigned dsty, unsigned width, unsigned height,
+                                    bool render_condition_enabled)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       unsigned num_layers = dstsurf->u.tex.last_layer - dstsurf->u.tex.first_layer + 1;
-       unsigned data[4 + sizeof(color->ui)] = {dstx, dsty, dstsurf->u.tex.first_layer, 0};
-
-       if (width == 0 || height == 0)
-               return;
-
-       /* The driver doesn't decompress resources automatically here. */
-       si_decompress_subresource(ctx, dstsurf->texture, PIPE_MASK_RGBA,
-                                 dstsurf->u.tex.level, dstsurf->u.tex.first_layer,
-                                 dstsurf->u.tex.last_layer);
-
-       if (util_format_is_srgb(dstsurf->format)) {
-               union pipe_color_union color_srgb;
-               for (int i = 0; i < 3; i++)
-                       color_srgb.f[i] = util_format_linear_to_srgb_float(color->f[i]);
-               color_srgb.f[3] = color->f[3];
-               memcpy(data + 4, color_srgb.ui, sizeof(color->ui));
-       } else {
-               memcpy(data + 4, color->ui, sizeof(color->ui));
-       }
-
-       sctx->render_cond_force_off = !render_condition_enabled;
-
-       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-                      si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
-       si_make_CB_shader_coherent(sctx, dstsurf->texture->nr_samples, true,
-                                  true /* DCC is not possible with image stores */);
-
-       struct pipe_constant_buffer saved_cb = {};
-       si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
-
-       struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
-       struct pipe_image_view saved_image = {0};
-       util_copy_image_view(&saved_image, &images->views[0]);
-
-       void *saved_cs = sctx->cs_shader_state.program;
-
-       struct pipe_constant_buffer cb = {};
-       cb.buffer_size = sizeof(data);
-       cb.user_buffer = data;
-       ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
-
-       struct pipe_image_view image = {0};
-       image.resource = dstsurf->texture;
-       image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE;
-       image.format = util_format_linear(dstsurf->format);
-       image.u.tex.level = dstsurf->u.tex.level;
-       image.u.tex.first_layer = 0; /* 3D images ignore first_layer (BASE_ARRAY) */
-       image.u.tex.last_layer = dstsurf->u.tex.last_layer;
-
-       ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image);
-
-       struct pipe_grid_info info = {0};
-
-       if (dstsurf->texture->target != PIPE_TEXTURE_1D_ARRAY) {
-               if (!sctx->cs_clear_render_target)
-                       sctx->cs_clear_render_target = si_clear_render_target_shader(ctx);
-               ctx->bind_compute_state(ctx, sctx->cs_clear_render_target);
-               info.block[0] = 8;
-               info.last_block[0] = width % 8;
-               info.block[1] = 8;
-               info.last_block[1] = height % 8;
-               info.block[2] = 1;
-               info.grid[0] = DIV_ROUND_UP(width, 8);
-               info.grid[1] = DIV_ROUND_UP(height, 8);
-               info.grid[2] = num_layers;
-       } else {
-               if (!sctx->cs_clear_render_target_1d_array)
-                       sctx->cs_clear_render_target_1d_array =
-                               si_clear_render_target_shader_1d_array(ctx);
-               ctx->bind_compute_state(ctx, sctx->cs_clear_render_target_1d_array);
-               info.block[0] = 64;
-               info.last_block[0] = width % 64;
-               info.block[1] = 1;
-               info.block[2] = 1;
-               info.grid[0] = DIV_ROUND_UP(width, 64);
-               info.grid[1] = num_layers;
-               info.grid[2] = 1;
-       }
-
-       si_launch_grid_internal(sctx, &info);
-
-       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-                      (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
-                      si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
-       ctx->bind_compute_state(ctx, saved_cs);
-       ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
-       ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
-       pipe_resource_reference(&saved_image.resource, NULL);
-       pipe_resource_reference(&saved_cb.buffer, NULL);
+   struct si_context *sctx = (struct si_context *)ctx;
+   unsigned num_layers = dstsurf->u.tex.last_layer - dstsurf->u.tex.first_layer + 1;
+   unsigned data[4 + sizeof(color->ui)] = {dstx, dsty, dstsurf->u.tex.first_layer, 0};
+
+   if (width == 0 || height == 0)
+      return;
+
+   /* The driver doesn't decompress resources automatically here. */
+   si_decompress_subresource(ctx, dstsurf->texture, PIPE_MASK_RGBA, dstsurf->u.tex.level,
+                             dstsurf->u.tex.first_layer, dstsurf->u.tex.last_layer);
+
+   if (util_format_is_srgb(dstsurf->format)) {
+      union pipe_color_union color_srgb;
+      for (int i = 0; i < 3; i++)
+         color_srgb.f[i] = util_format_linear_to_srgb_float(color->f[i]);
+      color_srgb.f[3] = color->f[3];
+      memcpy(data + 4, color_srgb.ui, sizeof(color->ui));
+   } else {
+      memcpy(data + 4, color->ui, sizeof(color->ui));
+   }
+
+   sctx->render_cond_force_off = !render_condition_enabled;
+
+   sctx->flags |=
+      SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
+   si_make_CB_shader_coherent(sctx, dstsurf->texture->nr_samples, true,
+                              true /* DCC is not possible with image stores */);
+
+   struct pipe_constant_buffer saved_cb = {};
+   si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+
+   struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
+   struct pipe_image_view saved_image = {0};
+   util_copy_image_view(&saved_image, &images->views[0]);
+
+   void *saved_cs = sctx->cs_shader_state.program;
+
+   struct pipe_constant_buffer cb = {};
+   cb.buffer_size = sizeof(data);
+   cb.user_buffer = data;
+   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
+
+   struct pipe_image_view image = {0};
+   image.resource = dstsurf->texture;
+   image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE;
+   image.format = util_format_linear(dstsurf->format);
+   image.u.tex.level = dstsurf->u.tex.level;
+   image.u.tex.first_layer = 0; /* 3D images ignore first_layer (BASE_ARRAY) */
+   image.u.tex.last_layer = dstsurf->u.tex.last_layer;
+
+   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image);
+
+   struct pipe_grid_info info = {0};
+
+   if (dstsurf->texture->target != PIPE_TEXTURE_1D_ARRAY) {
+      if (!sctx->cs_clear_render_target)
+         sctx->cs_clear_render_target = si_clear_render_target_shader(ctx);
+      ctx->bind_compute_state(ctx, sctx->cs_clear_render_target);
+      info.block[0] = 8;
+      info.last_block[0] = width % 8;
+      info.block[1] = 8;
+      info.last_block[1] = height % 8;
+      info.block[2] = 1;
+      info.grid[0] = DIV_ROUND_UP(width, 8);
+      info.grid[1] = DIV_ROUND_UP(height, 8);
+      info.grid[2] = num_layers;
+   } else {
+      if (!sctx->cs_clear_render_target_1d_array)
+         sctx->cs_clear_render_target_1d_array = si_clear_render_target_shader_1d_array(ctx);
+      ctx->bind_compute_state(ctx, sctx->cs_clear_render_target_1d_array);
+      info.block[0] = 64;
+      info.last_block[0] = width % 64;
+      info.block[1] = 1;
+      info.block[2] = 1;
+      info.grid[0] = DIV_ROUND_UP(width, 64);
+      info.grid[1] = num_layers;
+      info.grid[2] = 1;
+   }
+
+   si_launch_grid_internal(sctx, &info);
+
+   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
+                  si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
+   ctx->bind_compute_state(ctx, saved_cs);
+   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
+   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+   pipe_resource_reference(&saved_image.resource, NULL);
+   pipe_resource_reference(&saved_cb.buffer, NULL);
 }
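
For reference, util_format_linear_to_srgb_float applied to the clear color above is the standard linear-to-sRGB transfer function; only the RGB channels are encoded while alpha stays linear. A minimal sketch of that conversion (helper names are hypothetical):

#include <math.h>

/* Standard linear -> sRGB transfer function (sketch). */
static float linear_to_srgb(float x)
{
   if (x <= 0.0031308f)
      return 12.92f * x;
   return 1.055f * powf(x, 1.0f / 2.4f) - 0.055f;
}

/* Encode only RGB; alpha is passed through, as in the clear path above. */
static void encode_clear_color_srgb(const float in_rgba[4], float out_rgba[4])
{
   for (int i = 0; i < 3; i++)
      out_rgba[i] = linear_to_srgb(in_rgba[i]);
   out_rgba[3] = in_rgba[3];
}
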
diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
index 7f985ad3c62d3a90a0ae4d72c1ee34521b811715..389233835eb4299600f3d317e1579b766b05b582 100644
  *
  */
 
+#include "ac_llvm_cull.h"
+#include "si_build_pm4.h"
 #include "si_pipe.h"
 #include "si_shader_internal.h"
 #include "sid.h"
-#include "si_build_pm4.h"
-#include "ac_llvm_cull.h"
-
+#include "util/fast_idiv_by_const.h"
 #include "util/u_prim.h"
 #include "util/u_suballoc.h"
 #include "util/u_upload_mgr.h"
-#include "util/fast_idiv_by_const.h"
 
 /* Based on:
  * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf
 
 /* At least 256 is needed for the fastest wave launch rate from compute queues
  * due to hw constraints. Nothing in the code needs more than 1 wave/threadgroup. */
-#define THREADGROUP_SIZE               256 /* high numbers limit available VGPRs */
-#define THREADGROUPS_PER_CU            1 /* TGs to launch on 1 CU before going onto the next, max 8 */
-#define MAX_WAVES_PER_SH               0 /* no limit */
-#define INDEX_STORES_USE_SLC           1 /* don't cache indices if L2 is full */
+#define THREADGROUP_SIZE     256 /* high numbers limit available VGPRs */
+#define THREADGROUPS_PER_CU  1   /* TGs to launch on 1 CU before going onto the next, max 8 */
+#define MAX_WAVES_PER_SH     0   /* no limit */
+#define INDEX_STORES_USE_SLC 1   /* don't cache indices if L2 is full */
 /* Don't cull Z. We already do (W < 0) culling for primitives behind the viewer. */
-#define CULL_Z                         0
+#define CULL_Z 0
 /* 0 = unordered memory counter, 1 = unordered GDS counter, 2 = ordered GDS counter */
-#define VERTEX_COUNTER_GDS_MODE                2
-#define GDS_SIZE_UNORDERED             (4 * 1024) /* only for the unordered GDS counter */
+#define VERTEX_COUNTER_GDS_MODE 2
+#define GDS_SIZE_UNORDERED      (4 * 1024) /* only for the unordered GDS counter */
 
 /* Grouping compute dispatches for small draw calls: How many primitives from multiple
  * draw calls to process by compute before signaling the gfx IB. This reduces the number
  * of EOP events + REWIND packets, because they decrease performance. */
-#define PRIMS_PER_BATCH                        (512 * 1024)
+#define PRIMS_PER_BATCH (512 * 1024)
 /* Draw call splitting at the packet level. This allows signaling the gfx IB
  * for big draw calls sooner, but doesn't allow context flushes between packets.
  * Primitive restart is supported. Only implemented for ordered append. */
-#define SPLIT_PRIMS_PACKET_LEVEL_VALUE PRIMS_PER_BATCH
+#define SPLIT_PRIMS_PACKET_LEVEL_VALUE PRIMS_PER_BATCH
 /* If there is not enough ring buffer space for the current IB, split draw calls into
  * this number of primitives, so that we can flush the context and get free ring space. */
-#define SPLIT_PRIMS_DRAW_LEVEL         PRIMS_PER_BATCH
+#define SPLIT_PRIMS_DRAW_LEVEL PRIMS_PER_BATCH
 
 /* Derived values. */
-#define WAVES_PER_TG                   DIV_ROUND_UP(THREADGROUP_SIZE, 64)
-#define SPLIT_PRIMS_PACKET_LEVEL       (VERTEX_COUNTER_GDS_MODE == 2 ? \
-                                        SPLIT_PRIMS_PACKET_LEVEL_VALUE : \
-                                        UINT_MAX & ~(THREADGROUP_SIZE - 1))
+#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64)
+#define SPLIT_PRIMS_PACKET_LEVEL                                                                   \
+   (VERTEX_COUNTER_GDS_MODE == 2 ? SPLIT_PRIMS_PACKET_LEVEL_VALUE                                  \
+                                 : UINT_MAX & ~(THREADGROUP_SIZE - 1))
 
-#define REWIND_SIGNAL_BIT              0x80000000
+#define REWIND_SIGNAL_BIT 0x80000000
 /* For emulating the rewind packet on CI. */
-#define FORCE_REWIND_EMULATION         0
+#define FORCE_REWIND_EMULATION 0
 
-void si_initialize_prim_discard_tunables(struct si_screen *sscreen,
-                                        bool is_aux_context,
-                                        unsigned *prim_discard_vertex_count_threshold,
-                                        unsigned *index_ring_size_per_ib)
+void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context,
+                                         unsigned *prim_discard_vertex_count_threshold,
+                                         unsigned *index_ring_size_per_ib)
 {
-       *prim_discard_vertex_count_threshold = UINT_MAX; /* disable */
-
-       if (sscreen->info.chip_class == GFX6 || /* SI support is not implemented */
-           !sscreen->info.has_gds_ordered_append ||
-           sscreen->debug_flags & DBG(NO_PD) ||
-           is_aux_context)
-               return;
-
-       /* TODO: enable this after the GDS kernel memory management is fixed */
-       bool enable_on_pro_graphics_by_default = false;
-
-       if (sscreen->debug_flags & DBG(ALWAYS_PD) ||
-           sscreen->debug_flags & DBG(PD) ||
-           (enable_on_pro_graphics_by_default &&
-            sscreen->info.is_pro_graphics &&
-            (sscreen->info.family == CHIP_BONAIRE ||
-             sscreen->info.family == CHIP_HAWAII ||
-             sscreen->info.family == CHIP_TONGA ||
-             sscreen->info.family == CHIP_FIJI ||
-             sscreen->info.family == CHIP_POLARIS10 ||
-             sscreen->info.family == CHIP_POLARIS11 ||
-             sscreen->info.family == CHIP_VEGA10 ||
-             sscreen->info.family == CHIP_VEGA20))) {
-               *prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */
-
-               if (sscreen->debug_flags & DBG(ALWAYS_PD))
-                       *prim_discard_vertex_count_threshold = 0; /* always enable */
-
-               const uint32_t MB = 1024 * 1024;
-               const uint64_t GB = 1024 * 1024 * 1024;
-
-               /* The total size is double this per context.
-                * Greater numbers allow bigger gfx IBs.
-                */
-               if (sscreen->info.vram_size <= 2 * GB)
-                       *index_ring_size_per_ib = 64 * MB;
-               else if (sscreen->info.vram_size <= 4 * GB)
-                       *index_ring_size_per_ib = 128 * MB;
-               else
-                       *index_ring_size_per_ib = 256 * MB;
-       }
+   *prim_discard_vertex_count_threshold = UINT_MAX; /* disable */
+
+   if (sscreen->info.chip_class == GFX6 || /* SI support is not implemented */
+       !sscreen->info.has_gds_ordered_append || sscreen->debug_flags & DBG(NO_PD) || is_aux_context)
+      return;
+
+   /* TODO: enable this after the GDS kernel memory management is fixed */
+   bool enable_on_pro_graphics_by_default = false;
+
+   if (sscreen->debug_flags & DBG(ALWAYS_PD) || sscreen->debug_flags & DBG(PD) ||
+       (enable_on_pro_graphics_by_default && sscreen->info.is_pro_graphics &&
+        (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_HAWAII ||
+         sscreen->info.family == CHIP_TONGA || sscreen->info.family == CHIP_FIJI ||
+         sscreen->info.family == CHIP_POLARIS10 || sscreen->info.family == CHIP_POLARIS11 ||
+         sscreen->info.family == CHIP_VEGA10 || sscreen->info.family == CHIP_VEGA20))) {
+      *prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */
+
+      if (sscreen->debug_flags & DBG(ALWAYS_PD))
+         *prim_discard_vertex_count_threshold = 0; /* always enable */
+
+      const uint32_t MB = 1024 * 1024;
+      const uint64_t GB = 1024 * 1024 * 1024;
+
+      /* The total size is double this per context.
+       * Greater numbers allow bigger gfx IBs.
+       */
+      if (sscreen->info.vram_size <= 2 * GB)
+         *index_ring_size_per_ib = 64 * MB;
+      else if (sscreen->info.vram_size <= 4 * GB)
+         *index_ring_size_per_ib = 128 * MB;
+      else
+         *index_ring_size_per_ib = 256 * MB;
+   }
 }
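
The VRAM-size heuristic above picks the per-IB index ring size in three tiers; restated as a tiny standalone function (hypothetical name), using the same 64/128/256 MB values:

#include <stdint.h>

/* Sketch of the sizing tiers above; the total allocation is double this per context. */
static unsigned index_ring_size_per_ib_bytes(uint64_t vram_size)
{
   const uint64_t MB = 1024 * 1024;
   const uint64_t GB = 1024 * MB;

   if (vram_size <= 2 * GB)
      return 64 * MB;
   if (vram_size <= 4 * GB)
      return 128 * MB;
   return 256 * MB;
}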
 
 /* Opcode can be "add" or "swap". */
-static LLVMValueRef
-si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode,
-                      LLVMValueRef m0, LLVMValueRef value, unsigned ordered_count_index,
-                      bool release, bool done)
+static LLVMValueRef si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode,
+                                           LLVMValueRef m0, LLVMValueRef value,
+                                           unsigned ordered_count_index, bool release, bool done)
 {
-       if (ctx->screen->info.chip_class >= GFX10)
-               ordered_count_index |= 1 << 24; /* number of dwords == 1 */
-
-       LLVMValueRef args[] = {
-               LLVMBuildIntToPtr(ctx->ac.builder, m0,
-                                 LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), ""),
-               value,
-               LLVMConstInt(ctx->ac.i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */
-               ctx->ac.i32_0, /* scope */
-               ctx->ac.i1false, /* volatile */
-               LLVMConstInt(ctx->ac.i32, ordered_count_index, 0),
-               LLVMConstInt(ctx->ac.i1, release, 0),
-               LLVMConstInt(ctx->ac.i1, done, 0),
-       };
-
-       char intrinsic[64];
-       snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode);
-       return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->ac.i32, args, ARRAY_SIZE(args), 0);
+   if (ctx->screen->info.chip_class >= GFX10)
+      ordered_count_index |= 1 << 24; /* number of dwords == 1 */
+
+   LLVMValueRef args[] = {
+      LLVMBuildIntToPtr(ctx->ac.builder, m0, LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), ""),
+      value,
+      LLVMConstInt(ctx->ac.i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */
+      ctx->ac.i32_0,                                             /* scope */
+      ctx->ac.i1false,                                           /* volatile */
+      LLVMConstInt(ctx->ac.i32, ordered_count_index, 0),
+      LLVMConstInt(ctx->ac.i1, release, 0),
+      LLVMConstInt(ctx->ac.i1, done, 0),
+   };
+
+   char intrinsic[64];
+   snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode);
+   return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->ac.i32, args, ARRAY_SIZE(args), 0);
 }
 
 static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr)
 {
-       uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32;
-       ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->ac.i64, "");
-       ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->ac.i64, hi, 0), "");
-       return LLVMBuildIntToPtr(ctx->ac.builder, ptr,
-                                LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GLOBAL), "");
+   uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32;
+   ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->ac.i64, "");
+   ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->ac.i64, hi, 0), "");
+   return LLVMBuildIntToPtr(ctx->ac.builder, ptr,
+                            LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GLOBAL), "");
 }
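
si_expand_32bit_pointer above widens a 32-bit buffer address by OR-ing in the upper 32 bits that the driver knows for its 32-bit address space (address32_hi). In plain integer arithmetic, a sketch with a hypothetical name:

#include <stdint.h>

/* Sketch: rebuild a 64-bit GPU virtual address from its low 32 bits and the
 * fixed upper 32 bits of the 4 GiB aperture the low bits live in. */
static uint64_t expand_32bit_address(uint32_t addr_lo, uint32_t address32_hi)
{
   return (uint64_t)addr_lo | ((uint64_t)address32_hi << 32);
}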
 
 struct si_thread0_section {
-       struct si_shader_context *ctx;
-       LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */
-       LLVMValueRef saved_exec;
+   struct si_shader_context *ctx;
+   LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */
+   LLVMValueRef saved_exec;
 };
 
 /* Enter a section that only executes on thread 0. */
 static void si_enter_thread0_section(struct si_shader_context *ctx,
-                                    struct si_thread0_section *section,
-                                    LLVMValueRef thread_id)
+                                     struct si_thread0_section *section, LLVMValueRef thread_id)
 {
-       section->ctx = ctx;
-       section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "result0");
-
-       /* This IF has 4 instructions:
-        *   v_and_b32_e32 v, 63, v         ; get the thread ID
-        *   v_cmp_eq_u32_e32 vcc, 0, v     ; thread ID == 0
-        *   s_and_saveexec_b64 s, vcc
-        *   s_cbranch_execz BB0_4
-        *
-        * It could just be s_and_saveexec_b64 s, 1.
-        */
-       ac_build_ifcc(&ctx->ac,
-                     LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id,
-                                   ctx->ac.i32_0, ""), 12601);
+   section->ctx = ctx;
+   section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "result0");
+
+   /* This IF has 4 instructions:
+    *   v_and_b32_e32 v, 63, v         ; get the thread ID
+    *   v_cmp_eq_u32_e32 vcc, 0, v     ; thread ID == 0
+    *   s_and_saveexec_b64 s, vcc
+    *   s_cbranch_execz BB0_4
+    *
+    * It could just be s_and_saveexec_b64 s, 1.
+    */
+   ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id, ctx->ac.i32_0, ""),
+                 12601);
 }
 
 /* Exit a section that only executes on thread 0 and broadcast the result
  * to all threads. */
-static void si_exit_thread0_section(struct si_thread0_section *section,
-                                   LLVMValueRef *result)
+static void si_exit_thread0_section(struct si_thread0_section *section, LLVMValueRef *result)
 {
-       struct si_shader_context *ctx = section->ctx;
+   struct si_shader_context *ctx = section->ctx;
 
-       LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result);
+   LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result);
 
-       ac_build_endif(&ctx->ac, 12601);
+   ac_build_endif(&ctx->ac, 12601);
 
-       /* Broadcast the result from thread 0 to all threads. */
-       *result = ac_build_readlane(&ctx->ac,
-                       LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL);
+   /* Broadcast the result from thread 0 to all threads. */
+   *result =
+      ac_build_readlane(&ctx->ac, LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL);
 }
 
 void si_build_prim_discard_compute_shader(struct si_shader_context *ctx)
 {
-       struct si_shader_key *key = &ctx->shader->key;
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef vs = ctx->main_fn;
-
-       /* Always inline the VS function. */
-       ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE);
-       LLVMSetLinkage(vs, LLVMPrivateLinkage);
-
-       enum ac_arg_type const_desc_type;
-       if (ctx->shader->selector->info.const_buffers_declared == 1 &&
-           ctx->shader->selector->info.shader_buffers_declared == 0)
-               const_desc_type = AC_ARG_CONST_FLOAT_PTR;
-       else
-               const_desc_type = AC_ARG_CONST_DESC_PTR;
-
-       memset(&ctx->args, 0, sizeof(ctx->args));
-
-       struct ac_arg param_index_buffers_and_constants, param_vertex_counter;
-       struct ac_arg param_vb_desc, param_const_desc;
-       struct ac_arg param_base_vertex, param_start_instance;
-       struct ac_arg param_block_id, param_local_id, param_ordered_wave_id;
-       struct ac_arg param_restart_index, param_smallprim_precision;
-       struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms;
-       struct ac_arg param_sampler_desc, param_last_wave_prim_id, param_vertex_count_addr;
-
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
-                  &param_index_buffers_and_constants);
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_vertex_counter);
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_last_wave_prim_id);
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_vertex_count_addr);
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
-                  &param_vb_desc);
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type,
-                  &param_const_desc);
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
-                  &param_sampler_desc);
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_base_vertex);
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_start_instance);
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_multiplier);
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_terms);
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_restart_index);
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, &param_smallprim_precision);
-
-       /* Block ID and thread ID inputs. */
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_block_id);
-       if (VERTEX_COUNTER_GDS_MODE == 2)
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_ordered_wave_id);
-       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &param_local_id);
-
-       /* Create the compute shader function. */
-       unsigned old_type = ctx->type;
-       ctx->type = PIPE_SHADER_COMPUTE;
-       si_llvm_create_func(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE);
-       ctx->type = old_type;
-
-       if (VERTEX_COUNTER_GDS_MODE == 2) {
-               ac_llvm_add_target_dep_function_attr(ctx->main_fn,
-                                                    "amdgpu-gds-size", 256);
-       } else if (VERTEX_COUNTER_GDS_MODE == 1) {
-               ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size",
-                                                    GDS_SIZE_UNORDERED);
-       }
-
-       /* Assemble parameters for VS. */
-       LLVMValueRef vs_params[16];
-       unsigned num_vs_params = 0;
-       unsigned param_vertex_id, param_instance_id;
-
-       vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */
-       vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */
-       vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc);
-       vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc);
-       vs_params[num_vs_params++] = LLVMConstInt(ctx->ac.i32,
-                                       S_VS_STATE_INDEXED(key->opt.cs_indexed), 0);
-       vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex);
-       vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance);
-       vs_params[num_vs_params++] = ctx->ac.i32_0; /* DrawID */
-       vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc);
-
-       vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */
-       vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */
-       vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused (PrimID) */
-       vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused */
-
-       assert(num_vs_params <= ARRAY_SIZE(vs_params));
-       assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs))));
-
-       /* Load descriptors. (load 8 dwords at once) */
-       LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8];
-
-       LLVMValueRef index_buffers_and_constants = ac_get_arg(&ctx->ac, param_index_buffers_and_constants);
-       tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants,
-                                  ac_array_in_const32_addr_space(ctx->ac.v8i32), "");
-       tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->ac.i32_0);
-
-       for (unsigned i = 0; i < 8; i++)
-               desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i);
-
-       input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4);
-       output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4);
-
-       /* Compute PrimID and InstanceID. */
-       LLVMValueRef global_thread_id =
-               ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id),
-                             LLVMConstInt(ctx->ac.i32, THREADGROUP_SIZE, 0),
-                             ac_get_arg(&ctx->ac, param_local_id));
-       LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */
-       LLVMValueRef instance_id = ctx->ac.i32_0;
-
-       if (key->opt.cs_instancing) {
-               LLVMValueRef num_prims_udiv_terms =
-                       ac_get_arg(&ctx->ac, param_num_prims_udiv_terms);
-               LLVMValueRef num_prims_udiv_multiplier =
-                       ac_get_arg(&ctx->ac, param_num_prims_udiv_multiplier);
-               /* Unpack num_prims_udiv_terms. */
-               LLVMValueRef post_shift = LLVMBuildAnd(builder, num_prims_udiv_terms,
-                                                      LLVMConstInt(ctx->ac.i32, 0x1f, 0), "");
-               LLVMValueRef prims_per_instance = LLVMBuildLShr(builder, num_prims_udiv_terms,
-                                                               LLVMConstInt(ctx->ac.i32, 5, 0), "");
-               /* Divide the total prim_id by the number of prims per instance. */
-               instance_id = ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id,
-                                                              num_prims_udiv_multiplier,
-                                                              post_shift);
-               /* Compute the remainder. */
-               prim_id = LLVMBuildSub(builder, prim_id,
-                                      LLVMBuildMul(builder, instance_id,
-                                                   prims_per_instance, ""), "");
-       }
-
-       /* Generate indices (like a non-indexed draw call). */
-       LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->ac.i32)};
-       unsigned vertices_per_prim = 3;
-
-       switch (key->opt.cs_prim_type) {
-       case PIPE_PRIM_TRIANGLES:
-               for (unsigned i = 0; i < 3; i++) {
-                       index[i] = ac_build_imad(&ctx->ac, prim_id,
-                                                LLVMConstInt(ctx->ac.i32, 3, 0),
-                                                LLVMConstInt(ctx->ac.i32, i, 0));
-               }
-               break;
-       case PIPE_PRIM_TRIANGLE_STRIP:
-               for (unsigned i = 0; i < 3; i++) {
-                       index[i] = LLVMBuildAdd(builder, prim_id,
-                                               LLVMConstInt(ctx->ac.i32, i, 0), "");
-               }
-               break;
-       case PIPE_PRIM_TRIANGLE_FAN:
-               /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper
-                * and rasterizer as a normal triangle, so we need to put the provoking
-                * vertex into the correct index variable and preserve orientation at the same time.
-                * gl_VertexID is preserved, because it's equal to the index.
-                */
-               if (key->opt.cs_provoking_vertex_first) {
-                       index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), "");
-                       index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), "");
-                       index[2] = ctx->ac.i32_0;
-               } else {
-                       index[0] = ctx->ac.i32_0;
-                       index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), "");
-                       index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), "");
-               }
-               break;
-       default:
-               unreachable("unexpected primitive type");
-       }
-
-       /* Fetch indices. */
-       if (key->opt.cs_indexed) {
-               for (unsigned i = 0; i < 3; i++) {
-                       index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf,
-                                                              index[i], ctx->ac.i32_0, 1,
-                                                              0, true);
-                       index[i] = ac_to_integer(&ctx->ac, index[i]);
-               }
-       }
-
-       LLVMValueRef ordered_wave_id = NULL;
-
-       /* Extract the ordered wave ID. */
-       if (VERTEX_COUNTER_GDS_MODE == 2) {
-               ordered_wave_id = ac_get_arg(&ctx->ac, param_ordered_wave_id);
-               ordered_wave_id = LLVMBuildLShr(builder, ordered_wave_id,
-                                               LLVMConstInt(ctx->ac.i32, 6, 0), "");
-               ordered_wave_id = LLVMBuildAnd(builder, ordered_wave_id,
-                                              LLVMConstInt(ctx->ac.i32, 0xfff, 0), "");
-       }
-       LLVMValueRef thread_id =
-               LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id),
-                            LLVMConstInt(ctx->ac.i32, 63, 0), "");
-
-       /* Every other triangle in a strip has a reversed vertex order, so we
-        * need to swap vertices of odd primitives to get the correct primitive
-        * orientation when converting triangle strips to triangles. Primitive
-        * restart complicates it, because a strip can start anywhere.
-        */
-       LLVMValueRef prim_restart_accepted = ctx->ac.i1true;
-       LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter);
-
-       if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) {
-               /* Without primitive restart, odd primitives have reversed orientation.
-                * Only primitive restart can flip it with respect to the first vertex
-                * of the draw call.
-                */
-               LLVMValueRef first_is_odd = ctx->ac.i1false;
-
-               /* Handle primitive restart. */
-               if (key->opt.cs_primitive_restart) {
-                       /* Get the GDS primitive restart continue flag and clear
-                        * the flag in vertex_counter. This flag is used when the draw
-                        * call was split and we need to load the primitive orientation
-                        * flag from GDS for the first wave too.
-                        */
-                       LLVMValueRef gds_prim_restart_continue =
-                               LLVMBuildLShr(builder, vertex_counter,
-                                             LLVMConstInt(ctx->ac.i32, 31, 0), "");
-                       gds_prim_restart_continue =
-                               LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->ac.i1, "");
-                       vertex_counter = LLVMBuildAnd(builder, vertex_counter,
-                                                     LLVMConstInt(ctx->ac.i32, 0x7fffffff, 0), "");
-
-                       LLVMValueRef index0_is_reset;
-
-                       for (unsigned i = 0; i < 3; i++) {
-                               LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i],
-                                                                      ac_get_arg(&ctx->ac, param_restart_index),
-                                                                      "");
-                               if (i == 0)
-                                       index0_is_reset = LLVMBuildNot(builder, not_reset, "");
-                               prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted,
-                                                                    not_reset, "");
-                       }
-
-                       /* If the previous waves flip the primitive orientation
-                        * of the current triangle strip, it will be stored in GDS.
-                        *
-                        * Sometimes the correct orientation is not needed, in which case
-                        * we don't need to execute this.
-                        */
-                       if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) {
-                               /* If there are reset indices in this wave, get the thread index
-                                * where the most recent strip starts relative to each thread.
-                                */
-                               LLVMValueRef preceding_threads_mask =
-                                       LLVMBuildSub(builder,
-                                                    LLVMBuildShl(builder, ctx->ac.i64_1,
-                                                                 LLVMBuildZExt(builder, thread_id, ctx->ac.i64, ""), ""),
-                                                    ctx->ac.i64_1, "");
-
-                               LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset);
-                               LLVMValueRef preceding_reset_threadmask =
-                                       LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, "");
-                               LLVMValueRef strip_start =
-                                       ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL);
-                               strip_start = LLVMBuildAdd(builder, strip_start, ctx->ac.i32_1, "");
-
-                               /* This flips the orientation based on reset indices within this wave only. */
-                               first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->ac.i1, "");
-
-                               LLVMValueRef last_strip_start, prev_wave_state, ret, tmp;
-                               LLVMValueRef is_first_wave, current_wave_resets_index;
-
-                               /* Get the thread index where the last strip starts in this wave.
-                                *
-                                * If the last strip doesn't start in this wave, the thread index
-                                * will be 0.
-                                *
-                                * If the last strip starts in the next wave, the thread index will
-                                * be 64.
-                                */
-                               last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL);
-                               last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->ac.i32_1, "");
-
-                               struct si_thread0_section section;
-                               si_enter_thread0_section(ctx, &section, thread_id);
-
-                               /* This must be done in the thread 0 section, because
-                                * we expect PrimID to be 0 for the whole first wave
-                                * in this expression.
-                                *
-                                * NOTE: This will need to be different if we wanna support
-                                * instancing with primitive restart.
-                                */
-                               is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->ac.i32_0, "");
-                               is_first_wave = LLVMBuildAnd(builder, is_first_wave,
-                                                            LLVMBuildNot(builder,
-                                                                         gds_prim_restart_continue, ""), "");
-                               current_wave_resets_index = LLVMBuildICmp(builder, LLVMIntNE,
-                                                                         last_strip_start, ctx->ac.i32_0, "");
-
-                               ret = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "prev_state");
-
-                               /* Save the last strip start primitive index in GDS and read
-                                * the value that previous waves stored.
-                                *
-                                * if (is_first_wave || current_wave_resets_strip)
-                                *    // Read the value that previous waves stored and store a new one.
-                                *    first_is_odd = ds.ordered.swap(last_strip_start);
-                                * else
-                                *    // Just read the value that previous waves stored.
-                                *    first_is_odd = ds.ordered.add(0);
-                                */
-                               ac_build_ifcc(&ctx->ac,
-                                             LLVMBuildOr(builder, is_first_wave,
-                                                         current_wave_resets_index, ""), 12602);
-                               {
-                                       /* The GDS address is always 0 with ordered append. */
-                                       tmp = si_build_ds_ordered_op(ctx, "swap",
-                                                                    ordered_wave_id, last_strip_start,
-                                                                    1, true, false);
-                                       LLVMBuildStore(builder, tmp, ret);
-                               }
-                               ac_build_else(&ctx->ac, 12603);
-                               {
-                                       /* Just read the value from GDS. */
-                                       tmp = si_build_ds_ordered_op(ctx, "add",
-                                                                    ordered_wave_id, ctx->ac.i32_0,
-                                                                    1, true, false);
-                                       LLVMBuildStore(builder, tmp, ret);
-                               }
-                               ac_build_endif(&ctx->ac, 12602);
-
-                               prev_wave_state = LLVMBuildLoad(builder, ret, "");
-                               /* Ignore the return value if this is the first wave. */
-                               prev_wave_state = LLVMBuildSelect(builder, is_first_wave,
-                                                                 ctx->ac.i32_0, prev_wave_state, "");
-                               si_exit_thread0_section(&section, &prev_wave_state);
-                               prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->ac.i1, "");
-
-                               /* If the strip start appears to be on thread 0 for the current primitive
-                                * (meaning the reset index is not present in this wave and might have
-                                * appeared in previous waves), use the value from GDS to determine
-                                * primitive orientation.
-                                *
-                                * If the strip start is in this wave for the current primitive, use
-                                * the value from the current wave to determine primitive orientation.
-                                */
-                               LLVMValueRef strip_start_is0 = LLVMBuildICmp(builder, LLVMIntEQ,
-                                                                            strip_start, ctx->ac.i32_0, "");
-                               first_is_odd = LLVMBuildSelect(builder, strip_start_is0, prev_wave_state,
-                                                              first_is_odd, "");
-                       }
-               }
-               /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */
-               LLVMValueRef prim_is_odd =
-                       LLVMBuildXor(builder, first_is_odd,
-                                    LLVMBuildTrunc(builder, thread_id, ctx->ac.i1, ""), "");
-
-               /* Convert triangle strip indices to triangle indices. */
-               ac_build_triangle_strip_indices_to_triangle(&ctx->ac, prim_is_odd,
-                                                           LLVMConstInt(ctx->ac.i1, key->opt.cs_provoking_vertex_first, 0),
-                                                           index);
-       }
-
-       /* Execute the vertex shader for each vertex to get vertex positions. */
-       LLVMValueRef pos[3][4];
-       for (unsigned i = 0; i < vertices_per_prim; i++) {
-               vs_params[param_vertex_id] = index[i];
-               vs_params[param_instance_id] = instance_id;
-
-               LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params);
-               for (unsigned chan = 0; chan < 4; chan++)
-                       pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, "");
-       }
-
-       /* Divide XYZ by W. */
-       for (unsigned i = 0; i < vertices_per_prim; i++) {
-               for (unsigned chan = 0; chan < 3; chan++)
-                       pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]);
-       }
-
-       /* Load the viewport state. */
-       LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants,
-                                                 LLVMConstInt(ctx->ac.i32, 2, 0));
-       vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
-       LLVMValueRef vp_scale[2], vp_translate[2];
-       vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
-       vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
-       vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
-       vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);
-
-       /* Do culling. */
-       struct ac_cull_options options = {};
-       options.cull_front = key->opt.cs_cull_front;
-       options.cull_back = key->opt.cs_cull_back;
-       options.cull_view_xy = true;
-       options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z;
-       options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z;
-       options.cull_small_prims = true;
-       options.cull_zero_area = true;
-       options.cull_w = true;
-       options.use_halfz_clip_space = key->opt.cs_halfz_clip_space;
-
-       LLVMValueRef accepted =
-               ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted,
-                                vp_scale, vp_translate,
-                                ac_get_arg(&ctx->ac, param_smallprim_precision),
-                                &options);
-
-       ac_build_optimization_barrier(&ctx->ac, &accepted);
-       LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted);
-
-       /* Count the number of active threads by doing bitcount(accepted). */
-       LLVMValueRef num_prims_accepted =
-               ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i64", ctx->ac.i64,
-                                  &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE);
-       num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->ac.i32, "");
-
-       LLVMValueRef start;
-
-       /* Execute atomic_add on the vertex count. */
-       struct si_thread0_section section;
-       si_enter_thread0_section(ctx, &section, thread_id);
-       {
-               if (VERTEX_COUNTER_GDS_MODE == 0) {
-                       LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
-                                               LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
-                       vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter);
-                       start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
-                                                  vertex_counter, num_indices,
-                                                  LLVMAtomicOrderingMonotonic, false);
-               } else if (VERTEX_COUNTER_GDS_MODE == 1) {
-                       LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
-                                               LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
-                       vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter,
-                                                          LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), "");
-                       start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
-                                                  vertex_counter, num_indices,
-                                                  LLVMAtomicOrderingMonotonic, false);
-               } else if (VERTEX_COUNTER_GDS_MODE == 2) {
-                       LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
-
-                       /* If the draw call was split into multiple subdraws, each using
-                        * a separate draw packet, we need to start counting from 0 for
-                        * the first compute wave of the subdraw.
-                        *
-                        * vertex_counter contains the primitive ID of the first thread
-                        * in the first wave.
-                        *
-                        * This is only correct with VERTEX_COUNTER_GDS_MODE == 2:
-                        */
-                       LLVMValueRef is_first_wave =
-                               LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
-                                             vertex_counter, "");
-
-                       /* Store the primitive count for ordered append, not vertex count.
-                        * The idea is to avoid GDS initialization via CP DMA. The shader
-                        * effectively stores the first count using "swap".
-                        *
-                        * if (first_wave) {
-                        *    ds.ordered.swap(num_prims_accepted); // store the first primitive count
-                        *    previous = 0;
-                        * } else {
-                        *    previous = ds.ordered.add(num_prims_accepted) // add the primitive count
-                        * }
-                        */
-                       ac_build_ifcc(&ctx->ac, is_first_wave, 12604);
-                       {
-                               /* The GDS address is always 0 with ordered append. */
-                               si_build_ds_ordered_op(ctx, "swap", ordered_wave_id,
-                                                      num_prims_accepted, 0, true, true);
-                               LLVMBuildStore(builder, ctx->ac.i32_0, tmp_store);
-                       }
-                       ac_build_else(&ctx->ac, 12605);
-                       {
-                               LLVMBuildStore(builder,
-                                              si_build_ds_ordered_op(ctx, "add", ordered_wave_id,
-                                                                     num_prims_accepted, 0,
-                                                                     true, true),
-                                              tmp_store);
-                       }
-                       ac_build_endif(&ctx->ac, 12604);
-
-                       start = LLVMBuildLoad(builder, tmp_store, "");
-               }
-       }
-       si_exit_thread0_section(&section, &start);
-
-       /* Write the final vertex count to memory. An EOS/EOP event could do this,
-        * but those events are super slow and should be avoided if performance
-        * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE
-        * event like this.
-        */
-       if (VERTEX_COUNTER_GDS_MODE == 2) {
-               ac_build_ifcc(&ctx->ac,
-                             LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
-                                           ac_get_arg(&ctx->ac, param_last_wave_prim_id), ""),
-                             12606);
-               LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, "");
-               count = LLVMBuildMul(builder, count,
-                                    LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
-
-               /* GFX8 needs to disable caching, so that the CP can see the stored value.
-                * MTYPE=3 bypasses TC L2.
-                */
-               if (ctx->screen->info.chip_class <= GFX8) {
-                       LLVMValueRef desc[] = {
-                               ac_get_arg(&ctx->ac, param_vertex_count_addr),
-                               LLVMConstInt(ctx->ac.i32,
-                                       S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0),
-                               LLVMConstInt(ctx->ac.i32, 4, 0),
-                               LLVMConstInt(ctx->ac.i32, S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
-                                                      S_008F0C_MTYPE(3 /* uncached */), 0),
-                       };
-                       LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4);
-                       ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->ac.i32_0,
-                                                   ctx->ac.i32_0, 0, ac_glc | ac_slc);
-               } else {
-                       LLVMBuildStore(builder, count,
-                                      si_expand_32bit_pointer(ctx,
-                                                              ac_get_arg(&ctx->ac,
-                                                                         param_vertex_count_addr)));
-               }
-               ac_build_endif(&ctx->ac, 12606);
-       } else {
-               /* For unordered modes that increment a vertex count instead of
-                * primitive count, convert it into the primitive index.
-                */
-               start = LLVMBuildUDiv(builder, start,
-                                     LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
-       }
-
-       /* Now we need to store the indices of accepted primitives into
-        * the output index buffer.
-        */
-       ac_build_ifcc(&ctx->ac, accepted, 16607);
-       {
-               /* Get the number of bits set before the index of this thread. */
-               LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask);
-
-               /* We have lowered instancing. Pack the instance ID into vertex ID. */
-               if (key->opt.cs_instancing) {
-                       instance_id = LLVMBuildShl(builder, instance_id,
-                                                  LLVMConstInt(ctx->ac.i32, 16, 0), "");
-
-                       for (unsigned i = 0; i < vertices_per_prim; i++)
-                               index[i] = LLVMBuildOr(builder, index[i], instance_id, "");
-               }
-
-               if (VERTEX_COUNTER_GDS_MODE == 2) {
-                       /* vertex_counter contains the first primitive ID
-                        * for this dispatch. If the draw call was split into
-                        * multiple subdraws, the first primitive ID is > 0
-                        * for subsequent subdraws. Each subdraw uses a different
-                        * portion of the output index buffer. Offset the store
-                        * vindex by the first primitive ID to get the correct
-                        * store address for the subdraw.
-                        */
-                       start = LLVMBuildAdd(builder, start, vertex_counter, "");
-               }
-
-               /* Write indices for accepted primitives. */
-               LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, "");
-               LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3);
-
-               if (!ac_has_vec3_support(ctx->ac.chip_class, true))
-                       vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3);
-
-               ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata,
-                                            vindex, ctx->ac.i32_0, 3,
-                                            ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0));
-       }
-       ac_build_endif(&ctx->ac, 16607);
-
-       LLVMBuildRetVoid(builder);
+   struct si_shader_key *key = &ctx->shader->key;
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMValueRef vs = ctx->main_fn;
+
+   /* Always inline the VS function. */
+   ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE);
+   LLVMSetLinkage(vs, LLVMPrivateLinkage);
+
+   enum ac_arg_type const_desc_type;
+   if (ctx->shader->selector->info.const_buffers_declared == 1 &&
+       ctx->shader->selector->info.shader_buffers_declared == 0)
+      const_desc_type = AC_ARG_CONST_FLOAT_PTR;
+   else
+      const_desc_type = AC_ARG_CONST_DESC_PTR;
+
+   memset(&ctx->args, 0, sizeof(ctx->args));
+
+   struct ac_arg param_index_buffers_and_constants, param_vertex_counter;
+   struct ac_arg param_vb_desc, param_const_desc;
+   struct ac_arg param_base_vertex, param_start_instance;
+   struct ac_arg param_block_id, param_local_id, param_ordered_wave_id;
+   struct ac_arg param_restart_index, param_smallprim_precision;
+   struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms;
+   struct ac_arg param_sampler_desc, param_last_wave_prim_id, param_vertex_count_addr;
+
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
+              &param_index_buffers_and_constants);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_vertex_counter);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_last_wave_prim_id);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_vertex_count_addr);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &param_vb_desc);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type, &param_const_desc);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, &param_sampler_desc);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_base_vertex);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_start_instance);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_multiplier);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_terms);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_restart_index);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, &param_smallprim_precision);
+
+   /* Block ID and thread ID inputs. */
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_block_id);
+   if (VERTEX_COUNTER_GDS_MODE == 2)
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_ordered_wave_id);
+   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &param_local_id);
+
+   /* Create the compute shader function. */
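+   /* Temporarily switch the shader type so si_llvm_create_func builds a compute function. */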
+   unsigned old_type = ctx->type;
+   ctx->type = PIPE_SHADER_COMPUTE;
+   si_llvm_create_func(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE);
+   ctx->type = old_type;
+
+   if (VERTEX_COUNTER_GDS_MODE == 2) {
+      ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256);
+   } else if (VERTEX_COUNTER_GDS_MODE == 1) {
+      ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", GDS_SIZE_UNORDERED);
+   }
+
+   /* Assemble parameters for VS. */
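+   /* The argument order must match the VS function signature (checked by the asserts below). */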
+   LLVMValueRef vs_params[16];
+   unsigned num_vs_params = 0;
+   unsigned param_vertex_id, param_instance_id;
+
+   vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */
+   vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */
+   vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc);
+   vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc);
+   vs_params[num_vs_params++] =
+      LLVMConstInt(ctx->ac.i32, S_VS_STATE_INDEXED(key->opt.cs_indexed), 0);
+   vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex);
+   vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance);
+   vs_params[num_vs_params++] = ctx->ac.i32_0; /* DrawID */
+   vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc);
+
+   vs_params[(param_vertex_id = num_vs_params++)] = NULL;   /* VertexID */
+   vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */
+   vs_params[num_vs_params++] = ctx->ac.i32_0;              /* unused (PrimID) */
+   vs_params[num_vs_params++] = ctx->ac.i32_0;              /* unused */
+
+   assert(num_vs_params <= ARRAY_SIZE(vs_params));
+   assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs))));
+
+   /* Load descriptors. (load 8 dwords at once) */
+   LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8];
+
+   LLVMValueRef index_buffers_and_constants =
+      ac_get_arg(&ctx->ac, param_index_buffers_and_constants);
+   tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants,
+                              ac_array_in_const32_addr_space(ctx->ac.v8i32), "");
+   tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->ac.i32_0);
+
+   for (unsigned i = 0; i < 8; i++)
+      desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i);
+
+   input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4);
+   output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4);
+
+   /* Compute PrimID and InstanceID. */
+   LLVMValueRef global_thread_id = ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id),
+                                                 LLVMConstInt(ctx->ac.i32, THREADGROUP_SIZE, 0),
+                                                 ac_get_arg(&ctx->ac, param_local_id));
+   LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */
+   LLVMValueRef instance_id = ctx->ac.i32_0;
+
+   if (key->opt.cs_instancing) {
+      LLVMValueRef num_prims_udiv_terms = ac_get_arg(&ctx->ac, param_num_prims_udiv_terms);
+      LLVMValueRef num_prims_udiv_multiplier =
+         ac_get_arg(&ctx->ac, param_num_prims_udiv_multiplier);
+      /* Unpack num_prims_udiv_terms. */
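+      /* post_shift is packed in bits [4:0], prims_per_instance in bits [31:5]. */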
+      LLVMValueRef post_shift =
+         LLVMBuildAnd(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 0x1f, 0), "");
+      LLVMValueRef prims_per_instance =
+         LLVMBuildLShr(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 5, 0), "");
+      /* Divide the total prim_id by the number of prims per instance. */
+      instance_id =
+         ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id, num_prims_udiv_multiplier, post_shift);
+      /* Compute the remainder. */
+      prim_id = LLVMBuildSub(builder, prim_id,
+                             LLVMBuildMul(builder, instance_id, prims_per_instance, ""), "");
+   }
+
+   /* Generate indices (like a non-indexed draw call). */
+   LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->ac.i32)};
+   unsigned vertices_per_prim = 3;
+
+   switch (key->opt.cs_prim_type) {
+   case PIPE_PRIM_TRIANGLES:
+      for (unsigned i = 0; i < 3; i++) {
+         index[i] = ac_build_imad(&ctx->ac, prim_id, LLVMConstInt(ctx->ac.i32, 3, 0),
+                                  LLVMConstInt(ctx->ac.i32, i, 0));
+      }
+      break;
+   case PIPE_PRIM_TRIANGLE_STRIP:
+      for (unsigned i = 0; i < 3; i++) {
+         index[i] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, i, 0), "");
+      }
+      break;
+   case PIPE_PRIM_TRIANGLE_FAN:
+      /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper
+       * and rasterizer as a normal triangle, so we need to put the provoking
+       * vertex into the correct index variable and preserve orientation at the same time.
+       * gl_VertexID is preserved, because it's equal to the index.
+       */
+      if (key->opt.cs_provoking_vertex_first) {
+         index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), "");
+         index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), "");
+         index[2] = ctx->ac.i32_0;
+      } else {
+         index[0] = ctx->ac.i32_0;
+         index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), "");
+         index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), "");
+      }
+      break;
+   default:
+      unreachable("unexpected primitive type");
+   }
+
+   /* Fetch indices. */
+   if (key->opt.cs_indexed) {
+      for (unsigned i = 0; i < 3; i++) {
+         index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, index[i], ctx->ac.i32_0,
+                                                1, 0, true);
+         index[i] = ac_to_integer(&ctx->ac, index[i]);
+      }
+   }
+
+   LLVMValueRef ordered_wave_id = NULL;
+
+   /* Extract the ordered wave ID. */
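+   /* The input SGPR packs the ordered wave ID in bits [17:6]; shift and mask extract it. */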
+   if (VERTEX_COUNTER_GDS_MODE == 2) {
+      ordered_wave_id = ac_get_arg(&ctx->ac, param_ordered_wave_id);
+      ordered_wave_id =
+         LLVMBuildLShr(builder, ordered_wave_id, LLVMConstInt(ctx->ac.i32, 6, 0), "");
+      ordered_wave_id =
+         LLVMBuildAnd(builder, ordered_wave_id, LLVMConstInt(ctx->ac.i32, 0xfff, 0), "");
+   }
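+   /* Lane index within a 64-wide wave (local_id & 63). */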
+   LLVMValueRef thread_id = LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id),
+                                         LLVMConstInt(ctx->ac.i32, 63, 0), "");
+
+   /* Every other triangle in a strip has a reversed vertex order, so we
+    * need to swap vertices of odd primitives to get the correct primitive
+    * orientation when converting triangle strips to triangles. Primitive
+    * restart complicates it, because a strip can start anywhere.
+    */
+   LLVMValueRef prim_restart_accepted = ctx->ac.i1true;
+   LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter);
+
+   if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) {
+      /* Without primitive restart, odd primitives have reversed orientation.
+       * Only primitive restart can flip it with respect to the first vertex
+       * of the draw call.
+       */
+      LLVMValueRef first_is_odd = ctx->ac.i1false;
+
+      /* Handle primitive restart. */
+      if (key->opt.cs_primitive_restart) {
+         /* Get the GDS primitive restart continue flag and clear
+          * the flag in vertex_counter. This flag is used when the draw
+          * call was split and we need to load the primitive orientation
+          * flag from GDS for the first wave too.
+          */
+         LLVMValueRef gds_prim_restart_continue =
+            LLVMBuildLShr(builder, vertex_counter, LLVMConstInt(ctx->ac.i32, 31, 0), "");
+         gds_prim_restart_continue =
+            LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->ac.i1, "");
+         vertex_counter =
+            LLVMBuildAnd(builder, vertex_counter, LLVMConstInt(ctx->ac.i32, 0x7fffffff, 0), "");
+
+         LLVMValueRef index0_is_reset;
+
+         for (unsigned i = 0; i < 3; i++) {
+            LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i],
+                                                   ac_get_arg(&ctx->ac, param_restart_index), "");
+            if (i == 0)
+               index0_is_reset = LLVMBuildNot(builder, not_reset, "");
+            prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted, not_reset, "");
+         }
+
+         /* If the previous waves flip the primitive orientation
+          * of the current triangle strip, it will be stored in GDS.
+          *
+          * Sometimes the correct orientation is not needed, in which case
+          * we don't need to execute this.
+          */
+         if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) {
+            /* If there are reset indices in this wave, get the thread index
+             * where the most recent strip starts relative to each thread.
+             */
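+            /* preceding_threads_mask = (1 << thread_id) - 1: all lanes below this one. */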
+            LLVMValueRef preceding_threads_mask =
+               LLVMBuildSub(builder,
+                            LLVMBuildShl(builder, ctx->ac.i64_1,
+                                         LLVMBuildZExt(builder, thread_id, ctx->ac.i64, ""), ""),
+                            ctx->ac.i64_1, "");
+
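+            /* One bit set for each lane whose first index equals the restart index. */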
+            LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset);
+            LLVMValueRef preceding_reset_threadmask =
+               LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, "");
+            LLVMValueRef strip_start = ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL);
+            strip_start = LLVMBuildAdd(builder, strip_start, ctx->ac.i32_1, "");
+
+            /* This flips the orientation based on reset indices within this wave only. */
+            first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->ac.i1, "");
+
+            LLVMValueRef last_strip_start, prev_wave_state, ret, tmp;
+            LLVMValueRef is_first_wave, current_wave_resets_index;
+
+            /* Get the thread index where the last strip starts in this wave.
+             *
+             * If the last strip doesn't start in this wave, the thread index
+             * will be 0.
+             *
+             * If the last strip starts in the next wave, the thread index will
+             * be 64.
+             */
+            last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL);
+            last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->ac.i32_1, "");
+
+            struct si_thread0_section section;
+            si_enter_thread0_section(ctx, &section, thread_id);
+
+            /* This must be done in the thread 0 section, because
+             * we expect PrimID to be 0 for the whole first wave
+             * in this expression.
+             *
+             * NOTE: This will need to be different if we wanna support
+             * instancing with primitive restart.
+             */
+            is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->ac.i32_0, "");
+            is_first_wave = LLVMBuildAnd(builder, is_first_wave,
+                                         LLVMBuildNot(builder, gds_prim_restart_continue, ""), "");
+            current_wave_resets_index =
+               LLVMBuildICmp(builder, LLVMIntNE, last_strip_start, ctx->ac.i32_0, "");
+
+            ret = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "prev_state");
+
+            /* Save the last strip start primitive index in GDS and read
+             * the value that previous waves stored.
+             *
+             * if (is_first_wave || current_wave_resets_strip)
+             *    // Read the value that previous waves stored and store a new one.
+             *    first_is_odd = ds.ordered.swap(last_strip_start);
+             * else
+             *    // Just read the value that previous waves stored.
+             *    first_is_odd = ds.ordered.add(0);
+             */
+            ac_build_ifcc(
+               &ctx->ac, LLVMBuildOr(builder, is_first_wave, current_wave_resets_index, ""), 12602);
+            {
+               /* The GDS address is always 0 with ordered append. */
+               tmp = si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, last_strip_start, 1, true,
+                                            false);
+               LLVMBuildStore(builder, tmp, ret);
+            }
+            ac_build_else(&ctx->ac, 12603);
+            {
+               /* Just read the value from GDS. */
+               tmp = si_build_ds_ordered_op(ctx, "add", ordered_wave_id, ctx->ac.i32_0, 1, true,
+                                            false);
+               LLVMBuildStore(builder, tmp, ret);
+            }
+            ac_build_endif(&ctx->ac, 12602);
+
+            prev_wave_state = LLVMBuildLoad(builder, ret, "");
+            /* Ignore the return value if this is the first wave. */
+            prev_wave_state =
+               LLVMBuildSelect(builder, is_first_wave, ctx->ac.i32_0, prev_wave_state, "");
+            si_exit_thread0_section(&section, &prev_wave_state);
+            prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->ac.i1, "");
+
+            /* If the strip start appears to be on thread 0 for the current primitive
+             * (meaning the reset index is not present in this wave and might have
+             * appeared in previous waves), use the value from GDS to determine
+             * primitive orientation.
+             *
+             * If the strip start is in this wave for the current primitive, use
+             * the value from the current wave to determine primitive orientation.
+             */
+            LLVMValueRef strip_start_is0 =
+               LLVMBuildICmp(builder, LLVMIntEQ, strip_start, ctx->ac.i32_0, "");
+            first_is_odd =
+               LLVMBuildSelect(builder, strip_start_is0, prev_wave_state, first_is_odd, "");
+         }
+      }
+      /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */
+      LLVMValueRef prim_is_odd = LLVMBuildXor(
+         builder, first_is_odd, LLVMBuildTrunc(builder, thread_id, ctx->ac.i1, ""), "");
+
+      /* Convert triangle strip indices to triangle indices. */
+      ac_build_triangle_strip_indices_to_triangle(
+         &ctx->ac, prim_is_odd, LLVMConstInt(ctx->ac.i1, key->opt.cs_provoking_vertex_first, 0),
+         index);
+   }
+
+   /* Execute the vertex shader for each vertex to get vertex positions. */
+   LLVMValueRef pos[3][4];
+   for (unsigned i = 0; i < vertices_per_prim; i++) {
+      vs_params[param_vertex_id] = index[i];
+      vs_params[param_instance_id] = instance_id;
+
+      LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params);
+      for (unsigned chan = 0; chan < 4; chan++)
+         pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, "");
+   }
+
+   /* Divide XYZ by W. */
+   for (unsigned i = 0; i < vertices_per_prim; i++) {
+      for (unsigned chan = 0; chan < 3; chan++)
+         pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]);
+   }
+
+   /* Load the viewport state. */
+   LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants,
+                                             LLVMConstInt(ctx->ac.i32, 2, 0));
+   vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
+   LLVMValueRef vp_scale[2], vp_translate[2];
+   vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
+   vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
+   vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
+   vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);
+
+   /* Do culling. */
+   struct ac_cull_options options = {};
+   options.cull_front = key->opt.cs_cull_front;
+   options.cull_back = key->opt.cs_cull_back;
+   options.cull_view_xy = true;
+   options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z;
+   options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z;
+   options.cull_small_prims = true;
+   options.cull_zero_area = true;
+   options.cull_w = true;
+   options.use_halfz_clip_space = key->opt.cs_halfz_clip_space;
+
+   LLVMValueRef accepted =
+      ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, vp_scale, vp_translate,
+                       ac_get_arg(&ctx->ac, param_smallprim_precision), &options);
+
+   ac_build_optimization_barrier(&ctx->ac, &accepted);
+   LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted);
+
+   /* Count the number of active threads by doing bitcount(accepted). */
+   LLVMValueRef num_prims_accepted = ac_build_intrinsic(
+      &ctx->ac, "llvm.ctpop.i64", ctx->ac.i64, &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE);
+   num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->ac.i32, "");
+
+   LLVMValueRef start;
+
+   /* Execute atomic_add on the vertex count. */
+   struct si_thread0_section section;
+   si_enter_thread0_section(ctx, &section, thread_id);
+   {
+      if (VERTEX_COUNTER_GDS_MODE == 0) {
+         LLVMValueRef num_indices = LLVMBuildMul(
+            builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
+         vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter);
+         start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices,
+                                    LLVMAtomicOrderingMonotonic, false);
+      } else if (VERTEX_COUNTER_GDS_MODE == 1) {
+         LLVMValueRef num_indices = LLVMBuildMul(
+            builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
+         vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter,
+                                            LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), "");
+         start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices,
+                                    LLVMAtomicOrderingMonotonic, false);
+      } else if (VERTEX_COUNTER_GDS_MODE == 2) {
+         LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
+
+         /* If the draw call was split into multiple subdraws, each using
+          * a separate draw packet, we need to start counting from 0 for
+          * the first compute wave of the subdraw.
+          *
+          * vertex_counter contains the primitive ID of the first thread
+          * in the first wave.
+          *
+          * This is only correct with VERTEX_COUNTER_GDS_MODE == 2:
+          */
+         LLVMValueRef is_first_wave =
+            LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id, vertex_counter, "");
+
+         /* Store the primitive count for ordered append, not vertex count.
+          * The idea is to avoid GDS initialization via CP DMA. The shader
+          * effectively stores the first count using "swap".
+          *
+          * if (first_wave) {
+          *    ds.ordered.swap(num_prims_accepted); // store the first primitive count
+          *    previous = 0;
+          * } else {
+          *    previous = ds.ordered.add(num_prims_accepted) // add the primitive count
+          * }
+          */
+         ac_build_ifcc(&ctx->ac, is_first_wave, 12604);
+         {
+            /* The GDS address is always 0 with ordered append. */
+            si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, num_prims_accepted, 0, true, true);
+            LLVMBuildStore(builder, ctx->ac.i32_0, tmp_store);
+         }
+         ac_build_else(&ctx->ac, 12605);
+         {
+            LLVMBuildStore(builder,
+                           si_build_ds_ordered_op(ctx, "add", ordered_wave_id, num_prims_accepted,
+                                                  0, true, true),
+                           tmp_store);
+         }
+         ac_build_endif(&ctx->ac, 12604);
+
+         start = LLVMBuildLoad(builder, tmp_store, "");
+      }
+   }
+   si_exit_thread0_section(&section, &start);
+
+   /* Write the final vertex count to memory. An EOS/EOP event could do this,
+    * but those events are super slow and should be avoided if performance
+    * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE
+    * event like this.
+    */
+   if (VERTEX_COUNTER_GDS_MODE == 2) {
+      ac_build_ifcc(&ctx->ac,
+                    LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
+                                  ac_get_arg(&ctx->ac, param_last_wave_prim_id), ""),
+                    12606);
+      LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, "");
+      count = LLVMBuildMul(builder, count, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
+
+      /* GFX8 needs to disable caching, so that the CP can see the stored value.
+       * MTYPE=3 bypasses TC L2.
+       */
+      if (ctx->screen->info.chip_class <= GFX8) {
+         LLVMValueRef desc[] = {
+            ac_get_arg(&ctx->ac, param_vertex_count_addr),
+            LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0),
+            LLVMConstInt(ctx->ac.i32, 4, 0),
+            LLVMConstInt(
+               ctx->ac.i32,
+               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | S_008F0C_MTYPE(3 /* uncached */),
+               0),
+         };
+         LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4);
+         ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->ac.i32_0, ctx->ac.i32_0, 0,
+                                     ac_glc | ac_slc);
+      } else {
+         LLVMBuildStore(
+            builder, count,
+            si_expand_32bit_pointer(ctx, ac_get_arg(&ctx->ac, param_vertex_count_addr)));
+      }
+      ac_build_endif(&ctx->ac, 12606);
+   } else {
+      /* For unordered modes that increment a vertex count instead of
+       * primitive count, convert it into the primitive index.
+       */
+      start = LLVMBuildUDiv(builder, start, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
+   }
+
+   /* Now we need to store the indices of accepted primitives into
+    * the output index buffer.
+    */
+   ac_build_ifcc(&ctx->ac, accepted, 16607);
+   {
+      /* Get the number of bits set before the index of this thread. */
+      LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask);
+
+      /* We have lowered instancing. Pack the instance ID into vertex ID. */
+      if (key->opt.cs_instancing) {
+         instance_id = LLVMBuildShl(builder, instance_id, LLVMConstInt(ctx->ac.i32, 16, 0), "");
+
+         for (unsigned i = 0; i < vertices_per_prim; i++)
+            index[i] = LLVMBuildOr(builder, index[i], instance_id, "");
+      }
+
+      if (VERTEX_COUNTER_GDS_MODE == 2) {
+         /* vertex_counter contains the first primitive ID
+          * for this dispatch. If the draw call was split into
+          * multiple subdraws, the first primitive ID is > 0
+          * for subsequent subdraws. Each subdraw uses a different
+          * portion of the output index buffer. Offset the store
+          * vindex by the first primitive ID to get the correct
+          * store address for the subdraw.
+          */
+         start = LLVMBuildAdd(builder, start, vertex_counter, "");
+      }
+
+      /* Write indices for accepted primitives. */
+      LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, "");
+      LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3);
+
+      if (!ac_has_vec3_support(ctx->ac.chip_class, true))
+         vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3);
+
+      ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, vindex, ctx->ac.i32_0, 3,
+                                   ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0));
+   }
+   ac_build_endif(&ctx->ac, 16607);
+
+   LLVMBuildRetVoid(builder);
 }
 
 /* Return false if the shader isn't ready. */
 static bool si_shader_select_prim_discard_cs(struct si_context *sctx,
-                                            const struct pipe_draw_info *info,
-                                            bool primitive_restart)
+                                             const struct pipe_draw_info *info,
+                                             bool primitive_restart)
 {
-       struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
-       struct si_shader_key key;
-
-       /* Primitive restart needs ordered counters. */
-       assert(!primitive_restart || VERTEX_COUNTER_GDS_MODE == 2);
-       assert(!primitive_restart || info->instance_count == 1);
-
-       memset(&key, 0, sizeof(key));
-       si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, &key, &key.part.vs.prolog);
-       assert(!key.part.vs.prolog.instance_divisor_is_fetched);
-
-       key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0;
-       key.opt.vs_as_prim_discard_cs = 1;
-       key.opt.cs_prim_type = info->mode;
-       key.opt.cs_indexed = info->index_size != 0;
-       key.opt.cs_instancing = info->instance_count > 1;
-       key.opt.cs_primitive_restart = primitive_restart;
-       key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first;
-
-       /* Primitive restart with triangle strips needs to preserve primitive
-        * orientation for cases where front and back primitive orientation matters.
-        */
-       if (primitive_restart) {
-               struct si_shader_selector *ps = sctx->ps_shader.cso;
-
-               key.opt.cs_need_correct_orientation =
-                       rs->cull_front != rs->cull_back ||
-                       ps->info.uses_frontface ||
-                       (rs->two_side && ps->info.colors_read);
-       }
-
-       if (rs->rasterizer_discard) {
-               /* Just for performance testing and analysis of trivial bottlenecks.
-                * This should result in a very short compute shader. */
-               key.opt.cs_cull_front = 1;
-               key.opt.cs_cull_back = 1;
-       } else {
-               key.opt.cs_cull_front =
-                       sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front;
-               key.opt.cs_cull_back =
-                       sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back;
-       }
-
-       if (!rs->depth_clamp_any && CULL_Z) {
-               key.opt.cs_cull_z = 1;
-               key.opt.cs_halfz_clip_space = rs->clip_halfz;
-       }
-
-       sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso;
-       sctx->cs_prim_discard_state.current = NULL;
-
-       if (!sctx->compiler.passes)
-               si_init_compiler(sctx->screen, &sctx->compiler);
-
-       struct si_compiler_ctx_state compiler_state;
-       compiler_state.compiler = &sctx->compiler;
-       compiler_state.debug = sctx->debug;
-       compiler_state.is_debug_context = sctx->is_debug;
-
-       return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state,
-                                        &compiler_state, &key, -1, true) == 0 &&
-              /* Disallow compute shaders using the scratch buffer. */
-              sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0;
+   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+   struct si_shader_key key;
+
+   /* Primitive restart needs ordered counters. */
+   assert(!primitive_restart || VERTEX_COUNTER_GDS_MODE == 2);
+   assert(!primitive_restart || info->instance_count == 1);
+
+   memset(&key, 0, sizeof(key));
+   si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, &key, &key.part.vs.prolog);
+   assert(!key.part.vs.prolog.instance_divisor_is_fetched);
+
+   key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0;
+   key.opt.vs_as_prim_discard_cs = 1;
+   key.opt.cs_prim_type = info->mode;
+   key.opt.cs_indexed = info->index_size != 0;
+   key.opt.cs_instancing = info->instance_count > 1;
+   key.opt.cs_primitive_restart = primitive_restart;
+   key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first;
+
+   /* Primitive restart with triangle strips needs to preserve primitive
+    * orientation for cases where front and back primitive orientation matters.
+    */
+   if (primitive_restart) {
+      struct si_shader_selector *ps = sctx->ps_shader.cso;
+
+      key.opt.cs_need_correct_orientation = rs->cull_front != rs->cull_back ||
+                                            ps->info.uses_frontface ||
+                                            (rs->two_side && ps->info.colors_read);
+   }
+
+   if (rs->rasterizer_discard) {
+      /* Just for performance testing and analysis of trivial bottlenecks.
+       * This should result in a very short compute shader. */
+      key.opt.cs_cull_front = 1;
+      key.opt.cs_cull_back = 1;
+   } else {
+      key.opt.cs_cull_front = sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front;
+      key.opt.cs_cull_back = sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back;
+   }
+
+   if (!rs->depth_clamp_any && CULL_Z) {
+      key.opt.cs_cull_z = 1;
+      key.opt.cs_halfz_clip_space = rs->clip_halfz;
+   }
+
+   sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso;
+   sctx->cs_prim_discard_state.current = NULL;
+
+   if (!sctx->compiler.passes)
+      si_init_compiler(sctx->screen, &sctx->compiler);
+
+   struct si_compiler_ctx_state compiler_state;
+   compiler_state.compiler = &sctx->compiler;
+   compiler_state.debug = sctx->debug;
+   compiler_state.is_debug_context = sctx->is_debug;
+
+   return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state, &compiler_state,
+                                    &key, -1, true) == 0 &&
+          /* Disallow compute shaders using the scratch buffer. */
+          sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0;
 }
 
 static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx)
 {
-       if (sctx->index_ring)
-               return true;
-
-       if (!sctx->prim_discard_compute_cs) {
-               struct radeon_winsys *ws = sctx->ws;
-               unsigned gds_size = VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED :
-                                   VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0;
-               unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 2 : 0;
-
-               if (gds_size) {
-                       sctx->gds = ws->buffer_create(ws, gds_size, 4,
-                                                     RADEON_DOMAIN_GDS, 0);
-                       if (!sctx->gds)
-                               return false;
-
-                       ws->cs_add_buffer(sctx->gfx_cs, sctx->gds,
-                                         RADEON_USAGE_READWRITE, 0, 0);
-               }
-               if (num_oa_counters) {
-                       assert(gds_size);
-                       sctx->gds_oa = ws->buffer_create(ws, num_oa_counters,
-                                                        1, RADEON_DOMAIN_OA, 0);
-                       if (!sctx->gds_oa)
-                               return false;
-
-                       ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa,
-                                         RADEON_USAGE_READWRITE, 0, 0);
-               }
-
-               sctx->prim_discard_compute_cs =
-                       ws->cs_add_parallel_compute_ib(sctx->gfx_cs,
-                                                      num_oa_counters > 0);
-               if (!sctx->prim_discard_compute_cs)
-                       return false;
-       }
-
-       if (!sctx->index_ring) {
-               sctx->index_ring =
-                       si_aligned_buffer_create(sctx->b.screen,
-                                                SI_RESOURCE_FLAG_UNMAPPABLE,
-                                                PIPE_USAGE_DEFAULT,
-                                                sctx->index_ring_size_per_ib * 2,
-                                                sctx->screen->info.pte_fragment_size);
-               if (!sctx->index_ring)
-                       return false;
-       }
-       return true;
+   if (sctx->index_ring)
+      return true;
+
+   if (!sctx->prim_discard_compute_cs) {
+      struct radeon_winsys *ws = sctx->ws;
+      unsigned gds_size =
+         VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED : VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0;
+      unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 2 : 0;
+
+      if (gds_size) {
+         sctx->gds = ws->buffer_create(ws, gds_size, 4, RADEON_DOMAIN_GDS, 0);
+         if (!sctx->gds)
+            return false;
+
+         ws->cs_add_buffer(sctx->gfx_cs, sctx->gds, RADEON_USAGE_READWRITE, 0, 0);
+      }
+      if (num_oa_counters) {
+         assert(gds_size);
+         sctx->gds_oa = ws->buffer_create(ws, num_oa_counters, 1, RADEON_DOMAIN_OA, 0);
+         if (!sctx->gds_oa)
+            return false;
+
+         ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa, RADEON_USAGE_READWRITE, 0, 0);
+      }
+
+      sctx->prim_discard_compute_cs =
+         ws->cs_add_parallel_compute_ib(sctx->gfx_cs, num_oa_counters > 0);
+      if (!sctx->prim_discard_compute_cs)
+         return false;
+   }
+
+   if (!sctx->index_ring) {
+      sctx->index_ring = si_aligned_buffer_create(
+         sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT,
+         sctx->index_ring_size_per_ib * 2, sctx->screen->info.pte_fragment_size);
+      if (!sctx->index_ring)
+         return false;
+   }
+   return true;
 }
 
 static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_size)
 {
-       return sctx->index_ring_offset +
-              align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <=
-              sctx->index_ring_size_per_ib;
+   return sctx->index_ring_offset +
+             align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <=
+          sctx->index_ring_size_per_ib;
 }
 
 enum si_prim_discard_outcome
-si_prepare_prim_discard_or_split_draw(struct si_context *sctx,
-                                     const struct pipe_draw_info *info,
-                                     bool primitive_restart)
+si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info,
+                                      bool primitive_restart)
 {
-       /* If the compute shader compilation isn't finished, this returns false. */
-       if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart))
-               return SI_PRIM_DISCARD_DISABLED;
-
-       if (!si_initialize_prim_discard_cmdbuf(sctx))
-               return SI_PRIM_DISCARD_DISABLED;
-
-       struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
-       unsigned prim = info->mode;
-       unsigned count = info->count;
-       unsigned instance_count = info->instance_count;
-       unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count);
-       unsigned num_prims = num_prims_per_instance * instance_count;
-       unsigned out_indexbuf_size = num_prims * 12;
-       bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size);
-       const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL;
-
-       /* Split draws at the draw call level if the ring is full. This makes
-        * better use of the ring space.
-        */
-       if (ring_full &&
-           num_prims > split_prims_draw_level &&
-           instance_count == 1 && /* TODO: support splitting instanced draws */
-           (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) |
-                          (1 << PIPE_PRIM_TRIANGLE_STRIP))) {
-               /* Split draws. */
-               struct pipe_draw_info split_draw = *info;
-               split_draw.primitive_restart = primitive_restart;
-
-               unsigned base_start = split_draw.start;
-
-               if (prim == PIPE_PRIM_TRIANGLES) {
-                       unsigned vert_count_per_subdraw = split_prims_draw_level * 3;
-                       assert(vert_count_per_subdraw < count);
-
-                       for (unsigned start = 0; start < count; start += vert_count_per_subdraw) {
-                               split_draw.start = base_start + start;
-                               split_draw.count = MIN2(count - start, vert_count_per_subdraw);
-
-                               sctx->b.draw_vbo(&sctx->b, &split_draw);
-                       }
-               } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) {
-                       /* No primitive pair can be split, because strips reverse orientation
-                        * for odd primitives. */
-                       STATIC_ASSERT(split_prims_draw_level % 2 == 0);
-
-                       unsigned vert_count_per_subdraw = split_prims_draw_level;
-
-                       for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) {
-                               split_draw.start = base_start + start;
-                               split_draw.count = MIN2(count - start, vert_count_per_subdraw + 2);
-
-                               sctx->b.draw_vbo(&sctx->b, &split_draw);
-
-                               if (start == 0 &&
-                                   primitive_restart &&
-                                   sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation)
-                                       sctx->preserve_prim_restart_gds_at_flush = true;
-                       }
-                       sctx->preserve_prim_restart_gds_at_flush = false;
-               } else {
-                       assert(0);
-               }
-
-               return SI_PRIM_DISCARD_DRAW_SPLIT;
-       }
-
-       /* Just quit if the draw call doesn't fit into the ring and can't be split. */
-       if (out_indexbuf_size > sctx->index_ring_size_per_ib) {
-               if (SI_PRIM_DISCARD_DEBUG)
-                       puts("PD failed: draw call too big, can't be split");
-               return SI_PRIM_DISCARD_DISABLED;
-       }
-
-       unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL);
-       unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ +
-                                  24 * (num_subdraws - 1) + /* subdraws */
-                                  20; /* leave some space at the end */
-       unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx);
-
-       if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION)
-               need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */
-       else
-               need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */
-
-       if (ring_full ||
-           (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) ||
-           !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) {
-               /* If the current IB is empty but the size is too small, add a NOP
-                * packet to force a flush and get a bigger IB.
-                */
-               if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) &&
-                   gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) {
-                       radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
-                       radeon_emit(gfx_cs, 0);
-               }
-
-               si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
-       }
-
-       /* The compute IB is always chained, but we need to call cs_check_space to add more space. */
-       struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
-       ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false);
-       assert(compute_has_space);
-       assert(si_check_ring_space(sctx, out_indexbuf_size));
-       return SI_PRIM_DISCARD_ENABLED;
+   /* If the compute shader compilation isn't finished, this returns false. */
+   if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart))
+      return SI_PRIM_DISCARD_DISABLED;
+
+   if (!si_initialize_prim_discard_cmdbuf(sctx))
+      return SI_PRIM_DISCARD_DISABLED;
+
+   struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
+   unsigned prim = info->mode;
+   unsigned count = info->count;
+   unsigned instance_count = info->instance_count;
+   unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count);
+   unsigned num_prims = num_prims_per_instance * instance_count;
+   unsigned out_indexbuf_size = num_prims * 12;
+   bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size);
+   const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL;
+
+   /* Split draws at the draw call level if the ring is full. This makes
+    * better use of the ring space.
+    */
+   if (ring_full && num_prims > split_prims_draw_level &&
+       instance_count == 1 && /* TODO: support splitting instanced draws */
+       (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP))) {
+      /* Split draws. */
+      struct pipe_draw_info split_draw = *info;
+      split_draw.primitive_restart = primitive_restart;
+
+      unsigned base_start = split_draw.start;
+
+      if (prim == PIPE_PRIM_TRIANGLES) {
+         unsigned vert_count_per_subdraw = split_prims_draw_level * 3;
+         assert(vert_count_per_subdraw < count);
+
+         for (unsigned start = 0; start < count; start += vert_count_per_subdraw) {
+            split_draw.start = base_start + start;
+            split_draw.count = MIN2(count - start, vert_count_per_subdraw);
+
+            sctx->b.draw_vbo(&sctx->b, &split_draw);
+         }
+      } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) {
+         /* No primitive pair can be split, because strips reverse orientation
+          * for odd primitives. */
+         STATIC_ASSERT(split_prims_draw_level % 2 == 0);
+
+         unsigned vert_count_per_subdraw = split_prims_draw_level;
+
+         for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) {
+            split_draw.start = base_start + start;
+            split_draw.count = MIN2(count - start, vert_count_per_subdraw + 2);
+
+            sctx->b.draw_vbo(&sctx->b, &split_draw);
+
+            if (start == 0 && primitive_restart &&
+                sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation)
+               sctx->preserve_prim_restart_gds_at_flush = true;
+         }
+         sctx->preserve_prim_restart_gds_at_flush = false;
+      } else {
+         assert(0);
+      }
+
+      return SI_PRIM_DISCARD_DRAW_SPLIT;
+   }
+
+   /* Just quit if the draw call doesn't fit into the ring and can't be split. */
+   if (out_indexbuf_size > sctx->index_ring_size_per_ib) {
+      if (SI_PRIM_DISCARD_DEBUG)
+         puts("PD failed: draw call too big, can't be split");
+      return SI_PRIM_DISCARD_DISABLED;
+   }
+
+   unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL);
+   unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ +
+                              24 * (num_subdraws - 1) + /* subdraws */
+                              20;                       /* leave some space at the end */
+   unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx);
+
+   if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION)
+      need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */
+   else
+      need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */
+
+   if (ring_full ||
+       (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) ||
+       !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) {
+      /* If the current IB is empty but the size is too small, add a NOP
+       * packet to force a flush and get a bigger IB.
+       */
+      if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) &&
+          gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) {
+         radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
+         radeon_emit(gfx_cs, 0);
+      }
+
+      si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+   }
+
+   /* The compute IB is always chained, but we need to call cs_check_space to add more space. */
+   struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
+   ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false);
+   assert(compute_has_space);
+   assert(si_check_ring_space(sctx, out_indexbuf_size));
+   return SI_PRIM_DISCARD_ENABLED;
 }
 
 void si_compute_signal_gfx(struct si_context *sctx)
 {
-       struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
-       unsigned writeback_L2_flags = 0;
-
-       /* The writeback L2 flags vary with each chip generation. */
-       /* CI needs to flush vertex indices to memory. */
-       if (sctx->chip_class <= GFX7)
-               writeback_L2_flags = EVENT_TC_WB_ACTION_ENA;
-       else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0)
-               writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA;
-
-       if (!sctx->compute_num_prims_in_batch)
-               return;
-
-       assert(sctx->compute_rewind_va);
-
-       /* After the queued dispatches are done and vertex counts are written to
-        * the gfx IB, signal the gfx IB to continue. CP doesn't wait for
-        * the dispatches to finish, it only adds the CS_DONE event into the event
-        * queue.
-        */
-       si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags,
-                         sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
-                         writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM :
-                                              EOP_INT_SEL_NONE,
-                         EOP_DATA_SEL_VALUE_32BIT,
-                         NULL,
-                         sctx->compute_rewind_va |
-                         ((uint64_t)sctx->screen->info.address32_hi << 32),
-                         REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */
-                         SI_NOT_QUERY);
-
-       sctx->compute_rewind_va = 0;
-       sctx->compute_num_prims_in_batch = 0;
+   struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
+   unsigned writeback_L2_flags = 0;
+
+   /* The writeback L2 flags vary with each chip generation. */
+   /* CI needs to flush vertex indices to memory. */
+   if (sctx->chip_class <= GFX7)
+      writeback_L2_flags = EVENT_TC_WB_ACTION_ENA;
+   else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0)
+      writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA;
+
+   if (!sctx->compute_num_prims_in_batch)
+      return;
+
+   assert(sctx->compute_rewind_va);
+
+   /* After the queued dispatches are done and vertex counts are written to
+    * the gfx IB, signal the gfx IB to continue. CP doesn't wait for
+    * the dispatches to finish, it only adds the CS_DONE event into the event
+    * queue.
+    */
+   si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags,
+                     sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
+                     writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM : EOP_INT_SEL_NONE,
+                     EOP_DATA_SEL_VALUE_32BIT, NULL,
+                     sctx->compute_rewind_va | ((uint64_t)sctx->screen->info.address32_hi << 32),
+                     REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */
+                     SI_NOT_QUERY);
+
+   sctx->compute_rewind_va = 0;
+   sctx->compute_num_prims_in_batch = 0;
 }
 
 /* Dispatch a primitive discard compute shader. */
 void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
-                                         const struct pipe_draw_info *info,
-                                         unsigned index_size,
-                                         unsigned base_vertex,
-                                         uint64_t input_indexbuf_va,
-                                         unsigned input_indexbuf_num_elements)
+                                          const struct pipe_draw_info *info, unsigned index_size,
+                                          unsigned base_vertex, uint64_t input_indexbuf_va,
+                                          unsigned input_indexbuf_num_elements)
 {
-       struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
-       struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
-       unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, info->count);
-       if (!num_prims_per_instance)
-               return;
-
-       unsigned num_prims = num_prims_per_instance * info->instance_count;
-       unsigned vertices_per_prim, output_indexbuf_format, gfx10_output_indexbuf_format;
-
-       switch (info->mode) {
-       case PIPE_PRIM_TRIANGLES:
-       case PIPE_PRIM_TRIANGLE_STRIP:
-       case PIPE_PRIM_TRIANGLE_FAN:
-               vertices_per_prim = 3;
-               output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32;
-               gfx10_output_indexbuf_format = V_008F0C_IMG_FORMAT_32_32_32_UINT;
-               break;
-       default:
-               unreachable("unsupported primitive type");
-               return;
-       }
-
-       unsigned out_indexbuf_offset;
-       uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4;
-       bool first_dispatch = !sctx->prim_discard_compute_ib_initialized;
-
-       /* Initialize the compute IB if it's empty. */
-       if (!sctx->prim_discard_compute_ib_initialized) {
-               /* 1) State initialization. */
-               sctx->compute_gds_offset = 0;
-               sctx->compute_ib_last_shader = NULL;
-
-               if (sctx->last_ib_barrier_fence) {
-                       assert(!sctx->last_ib_barrier_buf);
-                       sctx->ws->cs_add_fence_dependency(gfx_cs,
-                                                         sctx->last_ib_barrier_fence,
-                                                         RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY);
-               }
-
-               /* 2) IB initialization. */
-
-               /* This needs to be done at the beginning of IBs due to possible
-                * TTM buffer moves in the kernel.
-                */
-               if (sctx->chip_class >= GFX10) {
-                       radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
-                       radeon_emit(cs, 0);             /* CP_COHER_CNTL */
-                       radeon_emit(cs, 0xffffffff);    /* CP_COHER_SIZE */
-                       radeon_emit(cs, 0xffffff);      /* CP_COHER_SIZE_HI */
-                       radeon_emit(cs, 0);             /* CP_COHER_BASE */
-                       radeon_emit(cs, 0);             /* CP_COHER_BASE_HI */
-                       radeon_emit(cs, 0x0000000A);    /* POLL_INTERVAL */
-                       radeon_emit(cs,                 /* GCR_CNTL */
-                                   S_586_GLI_INV(V_586_GLI_ALL) |
-                                   S_586_GLK_INV(1) | S_586_GLV_INV(1) |
-                                   S_586_GL1_INV(1) |
-                                   S_586_GL2_INV(1) | S_586_GL2_WB(1) |
-                                   S_586_GLM_INV(1) | S_586_GLM_WB(1) |
-                                   S_586_SEQ(V_586_SEQ_FORWARD));
-               } else {
-                       si_emit_surface_sync(sctx, cs,
-                                            S_0085F0_TC_ACTION_ENA(1) |
-                                            S_0085F0_TCL1_ACTION_ENA(1) |
-                                            S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) |
-                                            S_0085F0_SH_ICACHE_ACTION_ENA(1) |
-                                            S_0085F0_SH_KCACHE_ACTION_ENA(1));
-               }
-
-               /* Restore the GDS prim restart counter if needed. */
-               if (sctx->preserve_prim_restart_gds_at_flush) {
-                       si_cp_copy_data(sctx, cs,
-                                       COPY_DATA_GDS, NULL, 4,
-                                       COPY_DATA_SRC_MEM, sctx->wait_mem_scratch, 4);
-               }
-
-               si_emit_initial_compute_regs(sctx, cs);
-
-               radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE,
-                                 S_00B860_WAVES(sctx->scratch_waves) |
-                                 S_00B860_WAVESIZE(0)); /* no scratch */
-
-               /* Only 1D grids are launched. */
-               radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2);
-               radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) |
-                               S_00B820_NUM_THREAD_PARTIAL(1));
-               radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) |
-                               S_00B824_NUM_THREAD_PARTIAL(1));
-
-               radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2);
-               radeon_emit(cs, 0);
-               radeon_emit(cs, 0);
-
-               /* Disable ordered alloc for OA resources. */
-               for (unsigned i = 0; i < 2; i++) {
-                       radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3);
-                       radeon_emit(cs, S_031074_INDEX(i));
-                       radeon_emit(cs, 0);
-                       radeon_emit(cs, S_03107C_ENABLE(0));
-               }
-
-               if (sctx->last_ib_barrier_buf) {
-                       assert(!sctx->last_ib_barrier_fence);
-                       radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf,
-                                                 RADEON_USAGE_READ, RADEON_PRIO_FENCE);
-                       si_cp_wait_mem(sctx, cs,
-                                      sctx->last_ib_barrier_buf->gpu_address +
-                                      sctx->last_ib_barrier_buf_offset, 1, 1,
-                                      WAIT_REG_MEM_EQUAL);
-               }
-
-               sctx->prim_discard_compute_ib_initialized = true;
-       }
-
-       /* Allocate the output index buffer. */
-       output_indexbuf_size = align(output_indexbuf_size,
-                                    sctx->screen->info.tcc_cache_line_size);
-       assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib);
-       out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset;
-       sctx->index_ring_offset += output_indexbuf_size;
-
-       radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE,
-                                 RADEON_PRIO_SHADER_RW_BUFFER);
-       uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset;
-
-       /* Prepare index buffer descriptors. */
-       struct si_resource *indexbuf_desc = NULL;
-       unsigned indexbuf_desc_offset;
-       unsigned desc_size = 12 * 4;
-       uint32_t *desc;
-
-       u_upload_alloc(sctx->b.const_uploader, 0, desc_size,
-                      si_optimal_tcc_alignment(sctx, desc_size),
-                      &indexbuf_desc_offset, (struct pipe_resource**)&indexbuf_desc,
-                      (void**)&desc);
-       radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ,
-                                 RADEON_PRIO_DESCRIPTORS);
-
-       /* Input index buffer. */
-       desc[0] = input_indexbuf_va;
-       desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) |
-                 S_008F04_STRIDE(index_size);
-       desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? index_size : 1);
-
-       if (sctx->chip_class >= GFX10) {
-               desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
-                         S_008F0C_FORMAT(index_size == 1 ? V_008F0C_IMG_FORMAT_8_UINT :
-                                         index_size == 2 ? V_008F0C_IMG_FORMAT_16_UINT :
-                                                           V_008F0C_IMG_FORMAT_32_UINT) |
-                         S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
-                         S_008F0C_RESOURCE_LEVEL(1);
-       } else {
-               desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
-                         S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
-                         S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 :
-                                              index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16 :
-                                                                V_008F0C_BUF_DATA_FORMAT_32);
-       }
-
-       /* Output index buffer. */
-       desc[4] = out_indexbuf_va;
-       desc[5] = S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) |
-                 S_008F04_STRIDE(vertices_per_prim * 4);
-       desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1);
-
-       if (sctx->chip_class >= GFX10) {
-               desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
-                         S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-                         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
-                         S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
-                         S_008F0C_FORMAT(gfx10_output_indexbuf_format) |
-                         S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
-                         S_008F0C_RESOURCE_LEVEL(1);
-       } else {
-               desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
-                         S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-                         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
-                         S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
-                         S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
-                         S_008F0C_DATA_FORMAT(output_indexbuf_format);
-       }
-
-       /* Viewport state. */
-       struct si_small_prim_cull_info cull_info;
-       si_get_small_prim_cull_info(sctx, &cull_info);
-
-       desc[8] = fui(cull_info.scale[0]);
-       desc[9] = fui(cull_info.scale[1]);
-       desc[10] = fui(cull_info.translate[0]);
-       desc[11] = fui(cull_info.translate[1]);
-
-       /* Better subpixel precision increases the efficiency of small
-        * primitive culling. */
-       unsigned num_samples = sctx->framebuffer.nr_samples;
-       unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode;
-       float small_prim_cull_precision;
-
-       if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
-               small_prim_cull_precision = num_samples / 4096.0;
-       else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
-               small_prim_cull_precision = num_samples / 1024.0;
-       else
-               small_prim_cull_precision = num_samples / 256.0;
-
-       /* Set user data SGPRs. */
-       /* This can't be greater than 14 if we want the fastest launch rate. */
-       unsigned user_sgprs = 13;
-
-       uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset;
-       unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX);
-       unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX);
-       uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address;
-       uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address;
-       uint64_t vb_desc_va = sctx->vb_descriptors_buffer ?
-                                     sctx->vb_descriptors_buffer->gpu_address +
-                                     sctx->vb_descriptors_offset : 0;
-       unsigned gds_offset, gds_size;
-       struct si_fast_udiv_info32 num_prims_udiv = {};
-
-       if (info->instance_count > 1)
-               num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31);
-
-       /* Limitations on how these two are packed in the user SGPR. */
-       assert(num_prims_udiv.post_shift < 32);
-       assert(num_prims_per_instance < 1 << 27);
-
-       si_resource_reference(&indexbuf_desc, NULL);
-
-       bool primitive_restart = sctx->cs_prim_discard_state.current->key.opt.cs_primitive_restart;
-
-       if (VERTEX_COUNTER_GDS_MODE == 1) {
-               gds_offset = sctx->compute_gds_offset;
-               gds_size = primitive_restart ? 8 : 4;
-               sctx->compute_gds_offset += gds_size;
-
-               /* Reset the counters in GDS for the first dispatch using WRITE_DATA.
-                * The remainder of the GDS will be cleared after the dispatch packet
-                * in parallel with compute shaders.
-                */
-               if (first_dispatch) {
-                       radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size/4, 0));
-                       radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1));
-                       radeon_emit(cs, gds_offset);
-                       radeon_emit(cs, 0);
-                       radeon_emit(cs, 0); /* value to write */
-                       if (gds_size == 8)
-                               radeon_emit(cs, 0);
-               }
-       }
-
-       /* Set shader registers. */
-       struct si_shader *shader = sctx->cs_prim_discard_state.current;
-
-       if (shader != sctx->compute_ib_last_shader) {
-               radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ,
-                                         RADEON_PRIO_SHADER_BINARY);
-               uint64_t shader_va = shader->bo->gpu_address;
-
-               assert(shader->config.scratch_bytes_per_wave == 0);
-               assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4);
-
-               radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
-               radeon_emit(cs, shader_va >> 8);
-               radeon_emit(cs, S_00B834_DATA(shader_va >> 40));
-
-               radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
-               radeon_emit(cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) |
-                               S_00B848_SGPRS(sctx->chip_class <= GFX9 ?
-                                              (shader->config.num_sgprs - 1) / 8 : 0) |
-                               S_00B848_FLOAT_MODE(shader->config.float_mode) |
-                               S_00B848_DX10_CLAMP(1) |
-                               S_00B848_MEM_ORDERED(sctx->chip_class >= GFX10) |
-                               S_00B848_WGP_MODE(sctx->chip_class >= GFX10));
-               radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) |
-                               S_00B84C_USER_SGPR(user_sgprs) |
-                               S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) |
-                               S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) |
-                               S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) |
-                               S_00B84C_LDS_SIZE(shader->config.lds_size));
-
-               radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
-                       ac_get_compute_resource_limits(&sctx->screen->info,
-                                                      WAVES_PER_TG,
-                                                      MAX_WAVES_PER_SH,
-                                                      THREADGROUPS_PER_CU));
-               sctx->compute_ib_last_shader = shader;
-       }
-
-       STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0);
-
-       /* Big draw calls are split into smaller dispatches and draw packets. */
-       for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) {
-               unsigned num_subdraw_prims;
-
-               if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims)
-                       num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL;
-               else
-                       num_subdraw_prims = num_prims - start_prim;
-
-               /* Small dispatches are executed back to back until a specific primitive
-                * count is reached. Then, a CS_DONE is inserted to signal the gfx IB
-                * to start drawing the batch. This batching adds latency to the gfx IB,
-                * but CS_DONE and REWIND are too slow.
-                */
-               if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH)
-                       si_compute_signal_gfx(sctx);
-
-               if (sctx->compute_num_prims_in_batch == 0) {
-                       assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi);
-                       sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4;
-
-                       if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) {
-                               radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
-                               radeon_emit(gfx_cs, 0);
-
-                               si_cp_wait_mem(sctx, gfx_cs,
-                                              sctx->compute_rewind_va |
-                                              (uint64_t)sctx->screen->info.address32_hi << 32,
-                                              REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT,
-                                              WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP);
-
-                               /* Use INDIRECT_BUFFER to chain to a different buffer
-                                * to discard the CP prefetch cache.
-                                */
-                               sctx->ws->cs_check_space(gfx_cs, 0, true);
-                       } else {
-                               radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
-                               radeon_emit(gfx_cs, 0);
-                       }
-               }
-
-               sctx->compute_num_prims_in_batch += num_subdraw_prims;
-
-               uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4;
-               uint64_t index_va = out_indexbuf_va + start_prim * 12;
-
-               /* Emit the draw packet into the gfx IB. */
-               radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0));
-               radeon_emit(gfx_cs, num_prims * vertices_per_prim);
-               radeon_emit(gfx_cs, index_va);
-               radeon_emit(gfx_cs, index_va >> 32);
-               radeon_emit(gfx_cs, 0);
-               radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA);
-
-               /* Continue with the compute IB. */
-               if (start_prim == 0) {
-                       uint32_t gds_prim_restart_continue_bit = 0;
-
-                       if (sctx->preserve_prim_restart_gds_at_flush) {
-                               assert(primitive_restart &&
-                                      info->mode == PIPE_PRIM_TRIANGLE_STRIP);
-                               assert(start_prim < 1 << 31);
-                               gds_prim_restart_continue_bit = 1 << 31;
-                       }
-
-                       radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs);
-                       radeon_emit(cs, index_buffers_va);
-                       radeon_emit(cs,
-                                   VERTEX_COUNTER_GDS_MODE == 0 ? count_va :
-                                   VERTEX_COUNTER_GDS_MODE == 1 ? gds_offset :
-                                                                  start_prim |
-                                                                  gds_prim_restart_continue_bit);
-                       radeon_emit(cs, start_prim + num_subdraw_prims - 1);
-                       radeon_emit(cs, count_va);
-                       radeon_emit(cs, vb_desc_va);
-                       radeon_emit(cs, vs_const_desc_va);
-                       radeon_emit(cs, vs_sampler_desc_va);
-                       radeon_emit(cs, base_vertex);
-                       radeon_emit(cs, info->start_instance);
-                       radeon_emit(cs, num_prims_udiv.multiplier);
-                       radeon_emit(cs, num_prims_udiv.post_shift |
-                                       (num_prims_per_instance << 5));
-                       radeon_emit(cs, info->restart_index);
-                       /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
-                       radeon_emit(cs, fui(small_prim_cull_precision));
-               } else {
-                       assert(VERTEX_COUNTER_GDS_MODE == 2);
-                       /* Only update the SGPRs that changed. */
-                       radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3);
-                       radeon_emit(cs, start_prim);
-                       radeon_emit(cs, start_prim + num_subdraw_prims - 1);
-                       radeon_emit(cs, count_va);
-               }
-
-               /* Set grid dimensions. */
-               unsigned start_block = start_prim / THREADGROUP_SIZE;
-               unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE;
-               unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE;
-
-               radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block);
-               radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X,
-                                 S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) |
-                                 S_00B81C_NUM_THREAD_PARTIAL(partial_block_size));
-
-               radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) |
-                               PKT3_SHADER_TYPE_S(1));
-               radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size);
-               radeon_emit(cs, 1);
-               radeon_emit(cs, 1);
-               radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) |
-                               S_00B800_PARTIAL_TG_EN(!!partial_block_size) |
-                               S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) |
-                               S_00B800_ORDER_MODE(0 /* launch in order */));
-
-               /* This is only for unordered append. Ordered append writes this from
-                * the shader.
-                *
-                * Note that EOP and EOS events are super slow, so emulating the event
-                * in a shader is an important optimization.
-                */
-               if (VERTEX_COUNTER_GDS_MODE == 1) {
-                       si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0,
-                                         sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
-                                         EOP_INT_SEL_NONE,
-                                         EOP_DATA_SEL_GDS,
-                                         NULL,
-                                         count_va | ((uint64_t)sctx->screen->info.address32_hi << 32),
-                                         EOP_DATA_GDS(gds_offset / 4, 1),
-                                         SI_NOT_QUERY);
-
-                       /* Now that compute shaders are running, clear the remainder of GDS. */
-                       if (first_dispatch) {
-                               unsigned offset = gds_offset + gds_size;
-                               si_cp_dma_clear_buffer(sctx, cs, NULL, offset,
-                                                      GDS_SIZE_UNORDERED - offset,
-                                                      0,
-                                                      SI_CPDMA_SKIP_CHECK_CS_SPACE |
-                                                      SI_CPDMA_SKIP_GFX_SYNC |
-                                                      SI_CPDMA_SKIP_SYNC_BEFORE,
-                                                      SI_COHERENCY_NONE, L2_BYPASS);
-                       }
-               }
-               first_dispatch = false;
-
-               assert(cs->current.cdw <= cs->current.max_dw);
-               assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw);
-       }
+   struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
+   struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
+   unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, info->count);
+   if (!num_prims_per_instance)
+      return;
+
+   unsigned num_prims = num_prims_per_instance * info->instance_count;
+   unsigned vertices_per_prim, output_indexbuf_format, gfx10_output_indexbuf_format;
+
+   switch (info->mode) {
+   case PIPE_PRIM_TRIANGLES:
+   case PIPE_PRIM_TRIANGLE_STRIP:
+   case PIPE_PRIM_TRIANGLE_FAN:
+      vertices_per_prim = 3;
+      output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32;
+      gfx10_output_indexbuf_format = V_008F0C_IMG_FORMAT_32_32_32_UINT;
+      break;
+   default:
+      unreachable("unsupported primitive type");
+      return;
+   }
+
+   unsigned out_indexbuf_offset;
+   uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4;
+   bool first_dispatch = !sctx->prim_discard_compute_ib_initialized;
+
+   /* Initialize the compute IB if it's empty. */
+   if (!sctx->prim_discard_compute_ib_initialized) {
+      /* 1) State initialization. */
+      sctx->compute_gds_offset = 0;
+      sctx->compute_ib_last_shader = NULL;
+
+      if (sctx->last_ib_barrier_fence) {
+         assert(!sctx->last_ib_barrier_buf);
+         sctx->ws->cs_add_fence_dependency(gfx_cs, sctx->last_ib_barrier_fence,
+                                           RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY);
+      }
+
+      /* 2) IB initialization. */
+
+      /* This needs to be done at the beginning of IBs due to possible
+       * TTM buffer moves in the kernel.
+       */
+      if (sctx->chip_class >= GFX10) {
+         radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
+         radeon_emit(cs, 0);          /* CP_COHER_CNTL */
+         radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
+         radeon_emit(cs, 0xffffff);   /* CP_COHER_SIZE_HI */
+         radeon_emit(cs, 0);          /* CP_COHER_BASE */
+         radeon_emit(cs, 0);          /* CP_COHER_BASE_HI */
+         radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
+         radeon_emit(cs,              /* GCR_CNTL */
+                     S_586_GLI_INV(V_586_GLI_ALL) | S_586_GLK_INV(1) | S_586_GLV_INV(1) |
+                        S_586_GL1_INV(1) | S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) |
+                        S_586_GLM_WB(1) | S_586_SEQ(V_586_SEQ_FORWARD));
+      } else {
+         si_emit_surface_sync(sctx, cs,
+                              S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
+                                 S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) |
+                                 S_0085F0_SH_ICACHE_ACTION_ENA(1) |
+                                 S_0085F0_SH_KCACHE_ACTION_ENA(1));
+      }
+
+      /* Restore the GDS prim restart counter if needed. */
+      if (sctx->preserve_prim_restart_gds_at_flush) {
+         si_cp_copy_data(sctx, cs, COPY_DATA_GDS, NULL, 4, COPY_DATA_SRC_MEM,
+                         sctx->wait_mem_scratch, 4);
+      }
+
+      si_emit_initial_compute_regs(sctx, cs);
+
+      radeon_set_sh_reg(
+         cs, R_00B860_COMPUTE_TMPRING_SIZE,
+         S_00B860_WAVES(sctx->scratch_waves) | S_00B860_WAVESIZE(0)); /* no scratch */
+
+      /* Only 1D grids are launched. */
+      radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2);
+      radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) | S_00B820_NUM_THREAD_PARTIAL(1));
+      radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) | S_00B824_NUM_THREAD_PARTIAL(1));
+
+      radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2);
+      radeon_emit(cs, 0);
+      radeon_emit(cs, 0);
+
+      /* Disable ordered alloc for OA resources. */
+      for (unsigned i = 0; i < 2; i++) {
+         radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3);
+         radeon_emit(cs, S_031074_INDEX(i));
+         radeon_emit(cs, 0);
+         radeon_emit(cs, S_03107C_ENABLE(0));
+      }
+
+      if (sctx->last_ib_barrier_buf) {
+         assert(!sctx->last_ib_barrier_fence);
+         radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, RADEON_USAGE_READ,
+                                   RADEON_PRIO_FENCE);
+         si_cp_wait_mem(sctx, cs,
+                        sctx->last_ib_barrier_buf->gpu_address + sctx->last_ib_barrier_buf_offset,
+                        1, 1, WAIT_REG_MEM_EQUAL);
+      }
+
+      sctx->prim_discard_compute_ib_initialized = true;
+   }
+
+   /* Allocate the output index buffer. */
+   output_indexbuf_size = align(output_indexbuf_size, sctx->screen->info.tcc_cache_line_size);
+   assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib);
+   out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset;
+   sctx->index_ring_offset += output_indexbuf_size;
+
+   radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE,
+                             RADEON_PRIO_SHADER_RW_BUFFER);
+   uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset;
+
+   /* Prepare index buffer descriptors. */
+   struct si_resource *indexbuf_desc = NULL;
+   unsigned indexbuf_desc_offset;
+   unsigned desc_size = 12 * 4;
+   uint32_t *desc;
+
+   u_upload_alloc(sctx->b.const_uploader, 0, desc_size, si_optimal_tcc_alignment(sctx, desc_size),
+                  &indexbuf_desc_offset, (struct pipe_resource **)&indexbuf_desc, (void **)&desc);
+   radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ,
+                             RADEON_PRIO_DESCRIPTORS);
+
+   /* Input index buffer. */
+   desc[0] = input_indexbuf_va;
+   desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | S_008F04_STRIDE(index_size);
+   desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? index_size : 1);
+
+   if (sctx->chip_class >= GFX10) {
+      desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+                S_008F0C_FORMAT(index_size == 1 ? V_008F0C_IMG_FORMAT_8_UINT
+                                                : index_size == 2 ? V_008F0C_IMG_FORMAT_16_UINT
+                                                                  : V_008F0C_IMG_FORMAT_32_UINT) |
+                S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
+                S_008F0C_RESOURCE_LEVEL(1);
+   } else {
+      desc[3] =
+         S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
+         S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8
+                                              : index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16
+                                                                : V_008F0C_BUF_DATA_FORMAT_32);
+   }
+
+   /* Output index buffer. */
+   desc[4] = out_indexbuf_va;
+   desc[5] =
+      S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | S_008F04_STRIDE(vertices_per_prim * 4);
+   desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1);
+
+   if (sctx->chip_class >= GFX10) {
+      desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+                S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
+                S_008F0C_FORMAT(gfx10_output_indexbuf_format) |
+                S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
+                S_008F0C_RESOURCE_LEVEL(1);
+   } else {
+      desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+                S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
+                S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
+                S_008F0C_DATA_FORMAT(output_indexbuf_format);
+   }
+
+   /* Viewport state. */
+   struct si_small_prim_cull_info cull_info;
+   si_get_small_prim_cull_info(sctx, &cull_info);
+
+   desc[8] = fui(cull_info.scale[0]);
+   desc[9] = fui(cull_info.scale[1]);
+   desc[10] = fui(cull_info.translate[0]);
+   desc[11] = fui(cull_info.translate[1]);
+
+   /* Better subpixel precision increases the efficiency of small
+    * primitive culling. */
+   unsigned num_samples = sctx->framebuffer.nr_samples;
+   unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode;
+   float small_prim_cull_precision;
+
+   if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
+      small_prim_cull_precision = num_samples / 4096.0;
+   else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
+      small_prim_cull_precision = num_samples / 1024.0;
+   else
+      small_prim_cull_precision = num_samples / 256.0;
+
+   /* Set user data SGPRs. */
+   /* This can't be greater than 14 if we want the fastest launch rate. */
+   unsigned user_sgprs = 13;
+
+   uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset;
+   unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX);
+   unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX);
+   uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address;
+   uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address;
+   uint64_t vb_desc_va = sctx->vb_descriptors_buffer
+                            ? sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset
+                            : 0;
+   unsigned gds_offset, gds_size;
+   struct si_fast_udiv_info32 num_prims_udiv = {};
+
+   if (info->instance_count > 1)
+      num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31);
+
+   /* Limitations on how these two are packed in the user SGPR. */
+   assert(num_prims_udiv.post_shift < 32);
+   assert(num_prims_per_instance < 1 << 27);
+
+   si_resource_reference(&indexbuf_desc, NULL);
+
+   bool primitive_restart = sctx->cs_prim_discard_state.current->key.opt.cs_primitive_restart;
+
+   if (VERTEX_COUNTER_GDS_MODE == 1) {
+      gds_offset = sctx->compute_gds_offset;
+      gds_size = primitive_restart ? 8 : 4;
+      sctx->compute_gds_offset += gds_size;
+
+      /* Reset the counters in GDS for the first dispatch using WRITE_DATA.
+       * The remainder of the GDS will be cleared after the dispatch packet
+       * in parallel with compute shaders.
+       */
+      if (first_dispatch) {
+         radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size / 4, 0));
+         radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1));
+         radeon_emit(cs, gds_offset);
+         radeon_emit(cs, 0);
+         radeon_emit(cs, 0); /* value to write */
+         if (gds_size == 8)
+            radeon_emit(cs, 0);
+      }
+   }
+
+   /* Set shader registers. */
+   struct si_shader *shader = sctx->cs_prim_discard_state.current;
+
+   if (shader != sctx->compute_ib_last_shader) {
+      radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ,
+                                RADEON_PRIO_SHADER_BINARY);
+      uint64_t shader_va = shader->bo->gpu_address;
+
+      assert(shader->config.scratch_bytes_per_wave == 0);
+      assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4);
+
+      radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
+      radeon_emit(cs, shader_va >> 8);
+      radeon_emit(cs, S_00B834_DATA(shader_va >> 40));
+
+      radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
+      radeon_emit(
+         cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) |
+                S_00B848_SGPRS(sctx->chip_class <= GFX9 ? (shader->config.num_sgprs - 1) / 8 : 0) |
+                S_00B848_FLOAT_MODE(shader->config.float_mode) | S_00B848_DX10_CLAMP(1) |
+                S_00B848_MEM_ORDERED(sctx->chip_class >= GFX10) |
+                S_00B848_WGP_MODE(sctx->chip_class >= GFX10));
+      radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) | S_00B84C_USER_SGPR(user_sgprs) |
+                         S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) |
+                         S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) |
+                         S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) |
+                         S_00B84C_LDS_SIZE(shader->config.lds_size));
+
+      radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
+                        ac_get_compute_resource_limits(&sctx->screen->info, WAVES_PER_TG,
+                                                       MAX_WAVES_PER_SH, THREADGROUPS_PER_CU));
+      sctx->compute_ib_last_shader = shader;
+   }
+
+   STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0);
+
+   /* Big draw calls are split into smaller dispatches and draw packets. */
+   for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) {
+      unsigned num_subdraw_prims;
+
+      if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims)
+         num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL;
+      else
+         num_subdraw_prims = num_prims - start_prim;
+
+      /* Small dispatches are executed back to back until a specific primitive
+       * count is reached. Then, a CS_DONE is inserted to signal the gfx IB
+       * to start drawing the batch. This batching adds latency to the gfx IB,
+       * but CS_DONE and REWIND are too slow.
+       */
+      if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH)
+         si_compute_signal_gfx(sctx);
+
+      if (sctx->compute_num_prims_in_batch == 0) {
+         assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi);
+         sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4;
+
+         if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) {
+            radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
+            radeon_emit(gfx_cs, 0);
+
+            si_cp_wait_mem(
+               sctx, gfx_cs,
+               sctx->compute_rewind_va | (uint64_t)sctx->screen->info.address32_hi << 32,
+               REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP);
+
+            /* Use INDIRECT_BUFFER to chain to a different buffer
+             * to discard the CP prefetch cache.
+             */
+            sctx->ws->cs_check_space(gfx_cs, 0, true);
+         } else {
+            radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
+            radeon_emit(gfx_cs, 0);
+         }
+      }
+
+      sctx->compute_num_prims_in_batch += num_subdraw_prims;
+
+      uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4;
+      uint64_t index_va = out_indexbuf_va + start_prim * 12;
+
+      /* Emit the draw packet into the gfx IB. */
+      radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0));
+      radeon_emit(gfx_cs, num_prims * vertices_per_prim);
+      radeon_emit(gfx_cs, index_va);
+      radeon_emit(gfx_cs, index_va >> 32);
+      radeon_emit(gfx_cs, 0);
+      radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA);
+
+      /* Continue with the compute IB. */
+      if (start_prim == 0) {
+         uint32_t gds_prim_restart_continue_bit = 0;
+
+         if (sctx->preserve_prim_restart_gds_at_flush) {
+            assert(primitive_restart && info->mode == PIPE_PRIM_TRIANGLE_STRIP);
+            assert(start_prim < 1 << 31);
+            gds_prim_restart_continue_bit = 1 << 31;
+         }
+
+         radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs);
+         radeon_emit(cs, index_buffers_va);
+         radeon_emit(cs, VERTEX_COUNTER_GDS_MODE == 0
+                            ? count_va
+                            : VERTEX_COUNTER_GDS_MODE == 1
+                                 ? gds_offset
+                                 : start_prim | gds_prim_restart_continue_bit);
+         radeon_emit(cs, start_prim + num_subdraw_prims - 1);
+         radeon_emit(cs, count_va);
+         radeon_emit(cs, vb_desc_va);
+         radeon_emit(cs, vs_const_desc_va);
+         radeon_emit(cs, vs_sampler_desc_va);
+         radeon_emit(cs, base_vertex);
+         radeon_emit(cs, info->start_instance);
+         radeon_emit(cs, num_prims_udiv.multiplier);
+         radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5));
+         radeon_emit(cs, info->restart_index);
+         /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
+         radeon_emit(cs, fui(small_prim_cull_precision));
+      } else {
+         assert(VERTEX_COUNTER_GDS_MODE == 2);
+         /* Only update the SGPRs that changed. */
+         radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3);
+         radeon_emit(cs, start_prim);
+         radeon_emit(cs, start_prim + num_subdraw_prims - 1);
+         radeon_emit(cs, count_va);
+      }
+
+      /* Set grid dimensions. */
+      unsigned start_block = start_prim / THREADGROUP_SIZE;
+      unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE;
+      unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE;
+
+      radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block);
+      radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X,
+                        S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) |
+                           S_00B81C_NUM_THREAD_PARTIAL(partial_block_size));
+
+      radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1));
+      radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size);
+      radeon_emit(cs, 1);
+      radeon_emit(cs, 1);
+      radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_PARTIAL_TG_EN(!!partial_block_size) |
+                         S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) |
+                         S_00B800_ORDER_MODE(0 /* launch in order */));
+
+      /* This is only for unordered append. Ordered append writes this from
+       * the shader.
+       *
+       * Note that EOP and EOS events are super slow, so emulating the event
+       * in a shader is an important optimization.
+       */
+      if (VERTEX_COUNTER_GDS_MODE == 1) {
+         si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0,
+                           sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
+                           EOP_INT_SEL_NONE, EOP_DATA_SEL_GDS, NULL,
+                           count_va | ((uint64_t)sctx->screen->info.address32_hi << 32),
+                           EOP_DATA_GDS(gds_offset / 4, 1), SI_NOT_QUERY);
+
+         /* Now that compute shaders are running, clear the remainder of GDS. */
+         if (first_dispatch) {
+            unsigned offset = gds_offset + gds_size;
+            si_cp_dma_clear_buffer(
+               sctx, cs, NULL, offset, GDS_SIZE_UNORDERED - offset, 0,
+               SI_CPDMA_SKIP_CHECK_CS_SPACE | SI_CPDMA_SKIP_GFX_SYNC | SI_CPDMA_SKIP_SYNC_BEFORE,
+               SI_COHERENCY_NONE, L2_BYPASS);
+         }
+      }
+      first_dispatch = false;
+
+      assert(cs->current.cdw <= cs->current.max_dw);
+      assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw);
+   }
 }
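
The split-draw arithmetic in the prim-discard dispatch above packs a lot into a few registers. As a minimal standalone sketch of the same subdraw and threadgroup math — with SPLIT_PRIMS_PACKET_LEVEL and THREADGROUP_SIZE given illustrative values rather than the driver's real constants — it reduces to:

#include <stdio.h>

/* Illustrative stand-ins; the real constants live in si_compute_prim_discard.c
 * and may differ.  SPLIT_PRIMS_PACKET_LEVEL must stay a multiple of
 * THREADGROUP_SIZE, as the STATIC_ASSERT above requires. */
#define SPLIT_PRIMS_PACKET_LEVEL 512
#define THREADGROUP_SIZE         64

int main(void)
{
   unsigned num_prims = 1300; /* example draw size in primitives */

   for (unsigned start_prim = 0; start_prim < num_prims;
        start_prim += SPLIT_PRIMS_PACKET_LEVEL) {
      unsigned num_subdraw_prims = start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims
                                      ? SPLIT_PRIMS_PACKET_LEVEL
                                      : num_prims - start_prim;

      /* One compute thread per primitive: full threadgroups plus an optional
       * partial group at the end, as in the DISPATCH_DIRECT setup above. */
      unsigned start_block = start_prim / THREADGROUP_SIZE;
      unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE;
      unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE;

      printf("subdraw at prim %u: %u prims, start block %u, %u full blocks, "
             "partial block of %u threads\n",
             start_prim, num_subdraw_prims, start_block, num_full_blocks,
             partial_block_size);
   }
   return 0;
}
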
src/gallium/drivers/radeonsi/si_cp_dma.c
index 2ef41e44ded6aeaa80e97abe5d2dfca87f0ea30d..391c4f8d50b9eb318be2e092cc9544b64ccbdc11 100644
 
 /* Set this if you want the ME to wait until CP DMA is done.
  * It should be set on the last CP DMA packet. */
-#define CP_DMA_SYNC            (1 << 0)
+#define CP_DMA_SYNC (1 << 0)
 
 /* Set this if the source data was used as a destination in a previous CP DMA
  * packet. It's for preventing a read-after-write (RAW) hazard between two
  * CP DMA packets. */
-#define CP_DMA_RAW_WAIT                (1 << 1)
-#define CP_DMA_DST_IS_GDS      (1 << 2)
-#define CP_DMA_CLEAR           (1 << 3)
-#define CP_DMA_PFP_SYNC_ME     (1 << 4)
-#define CP_DMA_SRC_IS_GDS      (1 << 5)
+#define CP_DMA_RAW_WAIT    (1 << 1)
+#define CP_DMA_DST_IS_GDS  (1 << 2)
+#define CP_DMA_CLEAR       (1 << 3)
+#define CP_DMA_PFP_SYNC_ME (1 << 4)
+#define CP_DMA_SRC_IS_GDS  (1 << 5)
 
 /* The max number of bytes that can be copied per packet. */
 static inline unsigned cp_dma_max_byte_count(struct si_context *sctx)
 {
-       unsigned max = sctx->chip_class >= GFX9 ?
-                              S_414_BYTE_COUNT_GFX9(~0u) :
-                              S_414_BYTE_COUNT_GFX6(~0u);
+   unsigned max =
+      sctx->chip_class >= GFX9 ? S_414_BYTE_COUNT_GFX9(~0u) : S_414_BYTE_COUNT_GFX6(~0u);
 
-       /* make it aligned for optimal performance */
-       return max & ~(SI_CPDMA_ALIGNMENT - 1);
+   /* make it aligned for optimal performance */
+   return max & ~(SI_CPDMA_ALIGNMENT - 1);
 }
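
The round-down in cp_dma_max_byte_count() relies on SI_CPDMA_ALIGNMENT being a power of two. A tiny self-contained sketch of the same bit trick (the value 32 is assumed here purely for illustration, not taken from the driver):

#include <assert.h>
#include <stdio.h>

/* Assumed value for the example; the real define is elsewhere in the driver. */
#define SI_CPDMA_ALIGNMENT 32u

/* Round "value" down to a multiple of a power-of-two alignment,
 * the same expression used in cp_dma_max_byte_count(). */
static unsigned align_down_pot(unsigned value, unsigned alignment)
{
   assert(alignment && (alignment & (alignment - 1)) == 0);
   return value & ~(alignment - 1);
}

int main(void)
{
   printf("0x%x\n", align_down_pot(0x1fffff, SI_CPDMA_ALIGNMENT)); /* prints 0x1fffe0 */
   return 0;
}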
 
-
 /* Emit a CP DMA packet to do a copy from one buffer to another, or to clear
  * a buffer. The size must fit in bits [20:0]. If CP_DMA_CLEAR is set, src_va is a 32-bit
  * clear value.
  */
-static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs,
-                          uint64_t dst_va, uint64_t src_va, unsigned size,
-                          unsigned flags, enum si_cache_policy cache_policy)
+static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, uint64_t dst_va,
+                           uint64_t src_va, unsigned size, unsigned flags,
+                           enum si_cache_policy cache_policy)
 {
-       uint32_t header = 0, command = 0;
-
-       assert(size <= cp_dma_max_byte_count(sctx));
-       assert(sctx->chip_class != GFX6 || cache_policy == L2_BYPASS);
-
-       if (sctx->chip_class >= GFX9)
-               command |= S_414_BYTE_COUNT_GFX9(size);
-       else
-               command |= S_414_BYTE_COUNT_GFX6(size);
-
-       /* Sync flags. */
-       if (flags & CP_DMA_SYNC)
-               header |= S_411_CP_SYNC(1);
-       else {
-               if (sctx->chip_class >= GFX9)
-                       command |= S_414_DISABLE_WR_CONFIRM_GFX9(1);
-               else
-                       command |= S_414_DISABLE_WR_CONFIRM_GFX6(1);
-       }
-
-       if (flags & CP_DMA_RAW_WAIT)
-               command |= S_414_RAW_WAIT(1);
-
-       /* Src and dst flags. */
-       if (sctx->chip_class >= GFX9 && !(flags & CP_DMA_CLEAR) &&
-           src_va == dst_va) {
-               header |= S_411_DST_SEL(V_411_NOWHERE); /* prefetch only */
-       } else if (flags & CP_DMA_DST_IS_GDS) {
-               header |= S_411_DST_SEL(V_411_GDS);
-               /* GDS increments the address, not CP. */
-               command |= S_414_DAS(V_414_REGISTER) |
-                          S_414_DAIC(V_414_NO_INCREMENT);
-       } else if (sctx->chip_class >= GFX7 && cache_policy != L2_BYPASS) {
-               header |= S_411_DST_SEL(V_411_DST_ADDR_TC_L2) |
-                         S_500_DST_CACHE_POLICY(cache_policy == L2_STREAM);
-       }
-
-       if (flags & CP_DMA_CLEAR) {
-               header |= S_411_SRC_SEL(V_411_DATA);
-       } else if (flags & CP_DMA_SRC_IS_GDS) {
-               header |= S_411_SRC_SEL(V_411_GDS);
-               /* Both of these are required for GDS. It does increment the address. */
-               command |= S_414_SAS(V_414_REGISTER) |
-                          S_414_SAIC(V_414_NO_INCREMENT);
-       } else if (sctx->chip_class >= GFX7 && cache_policy != L2_BYPASS) {
-               header |= S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) |
-                         S_500_SRC_CACHE_POLICY(cache_policy == L2_STREAM);
-       }
-
-       if (sctx->chip_class >= GFX7) {
-               radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
-               radeon_emit(cs, header);
-               radeon_emit(cs, src_va);        /* SRC_ADDR_LO [31:0] */
-               radeon_emit(cs, src_va >> 32);  /* SRC_ADDR_HI [31:0] */
-               radeon_emit(cs, dst_va);        /* DST_ADDR_LO [31:0] */
-               radeon_emit(cs, dst_va >> 32);  /* DST_ADDR_HI [31:0] */
-               radeon_emit(cs, command);
-       } else {
-               header |= S_411_SRC_ADDR_HI(src_va >> 32);
-
-               radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
-               radeon_emit(cs, src_va);        /* SRC_ADDR_LO [31:0] */
-               radeon_emit(cs, header);        /* SRC_ADDR_HI [15:0] + flags. */
-               radeon_emit(cs, dst_va);        /* DST_ADDR_LO [31:0] */
-               radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */
-               radeon_emit(cs, command);
-       }
-
-       /* CP DMA is executed in ME, but index buffers are read by PFP.
-        * This ensures that ME (CP DMA) is idle before PFP starts fetching
-        * indices. If we wanted to execute CP DMA in PFP, this packet
-        * should precede it.
-        */
-       if (sctx->has_graphics && flags & CP_DMA_PFP_SYNC_ME) {
-               radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
-               radeon_emit(cs, 0);
-       }
+   uint32_t header = 0, command = 0;
+
+   assert(size <= cp_dma_max_byte_count(sctx));
+   assert(sctx->chip_class != GFX6 || cache_policy == L2_BYPASS);
+
+   if (sctx->chip_class >= GFX9)
+      command |= S_414_BYTE_COUNT_GFX9(size);
+   else
+      command |= S_414_BYTE_COUNT_GFX6(size);
+
+   /* Sync flags. */
+   if (flags & CP_DMA_SYNC)
+      header |= S_411_CP_SYNC(1);
+   else {
+      if (sctx->chip_class >= GFX9)
+         command |= S_414_DISABLE_WR_CONFIRM_GFX9(1);
+      else
+         command |= S_414_DISABLE_WR_CONFIRM_GFX6(1);
+   }
+
+   if (flags & CP_DMA_RAW_WAIT)
+      command |= S_414_RAW_WAIT(1);
+
+   /* Src and dst flags. */
+   if (sctx->chip_class >= GFX9 && !(flags & CP_DMA_CLEAR) && src_va == dst_va) {
+      header |= S_411_DST_SEL(V_411_NOWHERE); /* prefetch only */
+   } else if (flags & CP_DMA_DST_IS_GDS) {
+      header |= S_411_DST_SEL(V_411_GDS);
+      /* GDS increments the address, not CP. */
+      command |= S_414_DAS(V_414_REGISTER) | S_414_DAIC(V_414_NO_INCREMENT);
+   } else if (sctx->chip_class >= GFX7 && cache_policy != L2_BYPASS) {
+      header |=
+         S_411_DST_SEL(V_411_DST_ADDR_TC_L2) | S_500_DST_CACHE_POLICY(cache_policy == L2_STREAM);
+   }
+
+   if (flags & CP_DMA_CLEAR) {
+      header |= S_411_SRC_SEL(V_411_DATA);
+   } else if (flags & CP_DMA_SRC_IS_GDS) {
+      header |= S_411_SRC_SEL(V_411_GDS);
+      /* Both of these are required for GDS. It does increment the address. */
+      command |= S_414_SAS(V_414_REGISTER) | S_414_SAIC(V_414_NO_INCREMENT);
+   } else if (sctx->chip_class >= GFX7 && cache_policy != L2_BYPASS) {
+      header |=
+         S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) | S_500_SRC_CACHE_POLICY(cache_policy == L2_STREAM);
+   }
+
+   if (sctx->chip_class >= GFX7) {
+      radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
+      radeon_emit(cs, header);
+      radeon_emit(cs, src_va);       /* SRC_ADDR_LO [31:0] */
+      radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */
+      radeon_emit(cs, dst_va);       /* DST_ADDR_LO [31:0] */
+      radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */
+      radeon_emit(cs, command);
+   } else {
+      header |= S_411_SRC_ADDR_HI(src_va >> 32);
+
+      radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
+      radeon_emit(cs, src_va);                  /* SRC_ADDR_LO [31:0] */
+      radeon_emit(cs, header);                  /* SRC_ADDR_HI [15:0] + flags. */
+      radeon_emit(cs, dst_va);                  /* DST_ADDR_LO [31:0] */
+      radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */
+      radeon_emit(cs, command);
+   }
+
+   /* CP DMA is executed in ME, but index buffers are read by PFP.
+    * This ensures that ME (CP DMA) is idle before PFP starts fetching
+    * indices. If we wanted to execute CP DMA in PFP, this packet
+    * should precede it.
+    */
+   if (sctx->has_graphics && flags & CP_DMA_PFP_SYNC_ME) {
+      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+      radeon_emit(cs, 0);
+   }
 }
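
One detail worth calling out in si_emit_cp_dma() is the destination-address split: GFX7+ DMA_DATA carries a full 32-bit DST_ADDR_HI, while the legacy GFX6 CP_DMA packet only has 16 bits for it, hence the 0xffff mask on that path. A standalone sketch of the split (the VA below is an arbitrary example value):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
   /* Arbitrary example VA, not a real allocation. */
   uint64_t dst_va = 0x0000123480001000ull;

   unsigned dst_lo      = (unsigned)(dst_va & 0xffffffffu);   /* DST_ADDR_LO [31:0] */
   unsigned dst_hi_gfx7 = (unsigned)(dst_va >> 32);           /* DST_ADDR_HI [31:0] on GFX7+ */
   unsigned dst_hi_gfx6 = (unsigned)((dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] on GFX6 */

   printf("lo=%08x  hi(gfx7+)=%08x  hi(gfx6)=%04x\n", dst_lo, dst_hi_gfx7, dst_hi_gfx6);
   return 0;
}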
 
 void si_cp_dma_wait_for_idle(struct si_context *sctx)
 {
-       /* Issue a dummy DMA that copies zero bytes.
-        *
-        * The DMA engine will see that there's no work to do and skip this
-        * DMA request, however, the CP will see the sync flag and still wait
-        * for all DMAs to complete.
-        */
-       si_emit_cp_dma(sctx, sctx->gfx_cs, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS);
+   /* Issue a dummy DMA that copies zero bytes.
+    *
+    * The DMA engine will see that there's no work to do and skip this
+    * DMA request, however, the CP will see the sync flag and still wait
+    * for all DMAs to complete.
+    */
+   si_emit_cp_dma(sctx, sctx->gfx_cs, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS);
 }
 
 static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst,
-                             struct pipe_resource *src, unsigned byte_count,
-                             uint64_t remaining_size, unsigned user_flags,
-                             enum si_coherency coher, bool *is_first,
-                             unsigned *packet_flags)
+                              struct pipe_resource *src, unsigned byte_count,
+                              uint64_t remaining_size, unsigned user_flags, enum si_coherency coher,
+                              bool *is_first, unsigned *packet_flags)
 {
-       /* Fast exit for a CPDMA prefetch. */
-       if ((user_flags & SI_CPDMA_SKIP_ALL) == SI_CPDMA_SKIP_ALL) {
-               *is_first = false;
-               return;
-       }
-
-       if (!(user_flags & SI_CPDMA_SKIP_BO_LIST_UPDATE)) {
-               /* Count memory usage in so that need_cs_space can take it into account. */
-               if (dst)
-                       si_context_add_resource_size(sctx, dst);
-               if (src)
-                       si_context_add_resource_size(sctx, src);
-       }
-
-       if (!(user_flags & SI_CPDMA_SKIP_CHECK_CS_SPACE))
-               si_need_gfx_cs_space(sctx);
-
-       /* This must be done after need_cs_space. */
-       if (!(user_flags & SI_CPDMA_SKIP_BO_LIST_UPDATE)) {
-               if (dst)
-                       radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-                                                 si_resource(dst),
-                                                 RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
-               if (src)
-                       radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-                                                 si_resource(src),
-                                                 RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
-       }
-
-       /* Flush the caches for the first copy only.
-        * Also wait for the previous CP DMA operations.
-        */
-       if (!(user_flags & SI_CPDMA_SKIP_GFX_SYNC) && sctx->flags)
-               sctx->emit_cache_flush(sctx);
-
-       if (!(user_flags & SI_CPDMA_SKIP_SYNC_BEFORE) && *is_first &&
-           !(*packet_flags & CP_DMA_CLEAR))
-               *packet_flags |= CP_DMA_RAW_WAIT;
-
-       *is_first = false;
-
-       /* Do the synchronization after the last dma, so that all data
-        * is written to memory.
-        */
-       if (!(user_flags & SI_CPDMA_SKIP_SYNC_AFTER) &&
-           byte_count == remaining_size) {
-               *packet_flags |= CP_DMA_SYNC;
-
-               if (coher == SI_COHERENCY_SHADER)
-                       *packet_flags |= CP_DMA_PFP_SYNC_ME;
-       }
+   /* Fast exit for a CPDMA prefetch. */
+   if ((user_flags & SI_CPDMA_SKIP_ALL) == SI_CPDMA_SKIP_ALL) {
+      *is_first = false;
+      return;
+   }
+
+   if (!(user_flags & SI_CPDMA_SKIP_BO_LIST_UPDATE)) {
+      /* Count memory usage in so that need_cs_space can take it into account. */
+      if (dst)
+         si_context_add_resource_size(sctx, dst);
+      if (src)
+         si_context_add_resource_size(sctx, src);
+   }
+
+   if (!(user_flags & SI_CPDMA_SKIP_CHECK_CS_SPACE))
+      si_need_gfx_cs_space(sctx);
+
+   /* This must be done after need_cs_space. */
+   if (!(user_flags & SI_CPDMA_SKIP_BO_LIST_UPDATE)) {
+      if (dst)
+         radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(dst), RADEON_USAGE_WRITE,
+                                   RADEON_PRIO_CP_DMA);
+      if (src)
+         radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(src), RADEON_USAGE_READ,
+                                   RADEON_PRIO_CP_DMA);
+   }
+
+   /* Flush the caches for the first copy only.
+    * Also wait for the previous CP DMA operations.
+    */
+   if (!(user_flags & SI_CPDMA_SKIP_GFX_SYNC) && sctx->flags)
+      sctx->emit_cache_flush(sctx);
+
+   if (!(user_flags & SI_CPDMA_SKIP_SYNC_BEFORE) && *is_first && !(*packet_flags & CP_DMA_CLEAR))
+      *packet_flags |= CP_DMA_RAW_WAIT;
+
+   *is_first = false;
+
+   /* Do the synchronization after the last dma, so that all data
+    * is written to memory.
+    */
+   if (!(user_flags & SI_CPDMA_SKIP_SYNC_AFTER) && byte_count == remaining_size) {
+      *packet_flags |= CP_DMA_SYNC;
+
+      if (coher == SI_COHERENCY_SHADER)
+         *packet_flags |= CP_DMA_PFP_SYNC_ME;
+   }
 }
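
si_cp_dma_prepare() decides which packet flags each chunk of a multi-chunk operation gets: the first packet waits on earlier DMAs, the last one synchronizes (plus a PFP sync when the data is shader-coherent). A rough standalone sketch of that flag evolution, reusing the CP_DMA_* values defined at the top of this file and ignoring the CP_DMA_CLEAR and SKIP_* special cases:

#include <stdbool.h>
#include <stdio.h>

/* Flag values copied from the defines near the top of si_cp_dma.c. */
#define CP_DMA_SYNC        (1 << 0)
#define CP_DMA_RAW_WAIT    (1 << 1)
#define CP_DMA_PFP_SYNC_ME (1 << 4)

int main(void)
{
   /* Five chunks of an operation with shader coherency: only the first
    * gets RAW_WAIT, only the last gets SYNC | PFP_SYNC_ME. */
   unsigned remaining = 5;
   bool is_first = true;

   while (remaining) {
      unsigned flags = 0;

      if (is_first)
         flags |= CP_DMA_RAW_WAIT;
      is_first = false;

      if (remaining == 1) /* last chunk */
         flags |= CP_DMA_SYNC | CP_DMA_PFP_SYNC_ME;

      printf("chunk: flags=0x%x\n", flags);
      remaining--;
   }
   return 0;
}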
 
 void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
-                           struct pipe_resource *dst, uint64_t offset,
-                           uint64_t size, unsigned value, unsigned user_flags,
-                           enum si_coherency coher, enum si_cache_policy cache_policy)
+                            struct pipe_resource *dst, uint64_t offset, uint64_t size,
+                            unsigned value, unsigned user_flags, enum si_coherency coher,
+                            enum si_cache_policy cache_policy)
 {
-       struct si_resource *sdst = si_resource(dst);
-       uint64_t va = (sdst ? sdst->gpu_address : 0) + offset;
-       bool is_first = true;
-
-       assert(size && size % 4 == 0);
-
-       /* Mark the buffer range of destination as valid (initialized),
-        * so that transfer_map knows it should wait for the GPU when mapping
-        * that range. */
-       if (sdst)
-               util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size);
-
-       /* Flush the caches. */
-       if (sdst && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) {
-               sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-                              SI_CONTEXT_CS_PARTIAL_FLUSH |
-                              si_get_flush_flags(sctx, coher, cache_policy);
-       }
-
-       while (size) {
-               unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
-               unsigned dma_flags = CP_DMA_CLEAR | (sdst ? 0 : CP_DMA_DST_IS_GDS);
-
-               si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, user_flags,
-                                 coher, &is_first, &dma_flags);
-
-               /* Emit the clear packet. */
-               si_emit_cp_dma(sctx, cs, va, value, byte_count, dma_flags, cache_policy);
-
-               size -= byte_count;
-               va += byte_count;
-       }
-
-       if (sdst && cache_policy != L2_BYPASS)
-               sdst->TC_L2_dirty = true;
-
-       /* If it's not a framebuffer fast clear... */
-       if (coher == SI_COHERENCY_SHADER) {
-               sctx->num_cp_dma_calls++;
-               si_prim_discard_signal_next_compute_ib_start(sctx);
-       }
+   struct si_resource *sdst = si_resource(dst);
+   uint64_t va = (sdst ? sdst->gpu_address : 0) + offset;
+   bool is_first = true;
+
+   assert(size && size % 4 == 0);
+
+   /* Mark the buffer range of destination as valid (initialized),
+    * so that transfer_map knows it should wait for the GPU when mapping
+    * that range. */
+   if (sdst)
+      util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size);
+
+   /* Flush the caches. */
+   if (sdst && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) {
+      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
+                     si_get_flush_flags(sctx, coher, cache_policy);
+   }
+
+   while (size) {
+      unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
+      unsigned dma_flags = CP_DMA_CLEAR | (sdst ? 0 : CP_DMA_DST_IS_GDS);
+
+      si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, user_flags, coher, &is_first,
+                        &dma_flags);
+
+      /* Emit the clear packet. */
+      si_emit_cp_dma(sctx, cs, va, value, byte_count, dma_flags, cache_policy);
+
+      size -= byte_count;
+      va += byte_count;
+   }
+
+   if (sdst && cache_policy != L2_BYPASS)
+      sdst->TC_L2_dirty = true;
+
+   /* If it's not a framebuffer fast clear... */
+   if (coher == SI_COHERENCY_SHADER) {
+      sctx->num_cp_dma_calls++;
+      si_prim_discard_signal_next_compute_ib_start(sctx);
+   }
 }
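
The clear path above chops the range into cp_dma_max_byte_count()-sized pieces and walks the VA forward. A self-contained sketch of that chunking (MAX_BYTES is an assumed stand-in for the real per-chip limit):

#include <stdint.h>
#include <stdio.h>

/* Assumed stand-in for cp_dma_max_byte_count(); the real limit is
 * chip-dependent and rounded down to SI_CPDMA_ALIGNMENT. */
#define MAX_BYTES (1u << 20)

int main(void)
{
   uint64_t va = 0x100000000ull; /* example destination VA */
   uint64_t size = 2500000;      /* example clear size in bytes */

   while (size) {
      unsigned byte_count = size < MAX_BYTES ? (unsigned)size : MAX_BYTES;

      printf("clear %u bytes at va 0x%llx\n", byte_count, (unsigned long long)va);

      size -= byte_count;
      va += byte_count;
   }
   return 0;
}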
 
 /**
@@ -261,41 +250,34 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
  *
  * \param size  Remaining size to the CP DMA alignment.
  */
-static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size,
-                                    unsigned user_flags, enum si_coherency coher,
-                                    enum si_cache_policy cache_policy,
-                                    bool *is_first)
+static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size, unsigned user_flags,
+                                     enum si_coherency coher, enum si_cache_policy cache_policy,
+                                     bool *is_first)
 {
-       uint64_t va;
-       unsigned dma_flags = 0;
-       unsigned scratch_size = SI_CPDMA_ALIGNMENT * 2;
-
-       assert(size < SI_CPDMA_ALIGNMENT);
-
-       /* Use the scratch buffer as the dummy buffer. The 3D engine should be
-        * idle at this point.
-        */
-       if (!sctx->scratch_buffer ||
-           sctx->scratch_buffer->b.b.width0 < scratch_size) {
-               si_resource_reference(&sctx->scratch_buffer, NULL);
-               sctx->scratch_buffer =
-                       si_aligned_buffer_create(&sctx->screen->b,
-                                                  SI_RESOURCE_FLAG_UNMAPPABLE,
-                                                  PIPE_USAGE_DEFAULT,
-                                                  scratch_size, 256);
-               if (!sctx->scratch_buffer)
-                       return;
-
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state);
-       }
-
-       si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b,
-                         &sctx->scratch_buffer->b.b, size, size, user_flags,
-                         coher, is_first, &dma_flags);
-
-       va = sctx->scratch_buffer->gpu_address;
-       si_emit_cp_dma(sctx, sctx->gfx_cs, va, va + SI_CPDMA_ALIGNMENT, size, dma_flags,
-                      cache_policy);
+   uint64_t va;
+   unsigned dma_flags = 0;
+   unsigned scratch_size = SI_CPDMA_ALIGNMENT * 2;
+
+   assert(size < SI_CPDMA_ALIGNMENT);
+
+   /* Use the scratch buffer as the dummy buffer. The 3D engine should be
+    * idle at this point.
+    */
+   if (!sctx->scratch_buffer || sctx->scratch_buffer->b.b.width0 < scratch_size) {
+      si_resource_reference(&sctx->scratch_buffer, NULL);
+      sctx->scratch_buffer = si_aligned_buffer_create(&sctx->screen->b, SI_RESOURCE_FLAG_UNMAPPABLE,
+                                                      PIPE_USAGE_DEFAULT, scratch_size, 256);
+      if (!sctx->scratch_buffer)
+         return;
+
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state);
+   }
+
+   si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b, &sctx->scratch_buffer->b.b, size, size,
+                     user_flags, coher, is_first, &dma_flags);
+
+   va = sctx->scratch_buffer->gpu_address;
+   si_emit_cp_dma(sctx, sctx->gfx_cs, va, va + SI_CPDMA_ALIGNMENT, size, dma_flags, cache_policy);
 }
 
 /**
@@ -304,141 +286,131 @@ static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size,
  *
  * \param user_flags   bitmask of SI_CPDMA_*
  */
-void si_cp_dma_copy_buffer(struct si_context *sctx,
-                          struct pipe_resource *dst, struct pipe_resource *src,
-                          uint64_t dst_offset, uint64_t src_offset, unsigned size,
-                          unsigned user_flags, enum si_coherency coher,
-                          enum si_cache_policy cache_policy)
+void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
+                           struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset,
+                           unsigned size, unsigned user_flags, enum si_coherency coher,
+                           enum si_cache_policy cache_policy)
 {
-       uint64_t main_dst_offset, main_src_offset;
-       unsigned skipped_size = 0;
-       unsigned realign_size = 0;
-       unsigned gds_flags = (dst ? 0 : CP_DMA_DST_IS_GDS) |
-                            (src ? 0 : CP_DMA_SRC_IS_GDS);
-       bool is_first = true;
-
-       assert(size);
-
-       if (dst) {
-               /* Skip this for the L2 prefetch. */
-               if (dst != src || dst_offset != src_offset) {
-                       /* Mark the buffer range of destination as valid (initialized),
-                        * so that transfer_map knows it should wait for the GPU when mapping
-                        * that range. */
-                       util_range_add(dst, &si_resource(dst)->valid_buffer_range, dst_offset,
-                                      dst_offset + size);
-               }
-
-               dst_offset += si_resource(dst)->gpu_address;
-       }
-       if (src)
-               src_offset += si_resource(src)->gpu_address;
-
-       /* The workarounds aren't needed on Fiji and beyond. */
-       if (sctx->family <= CHIP_CARRIZO ||
-           sctx->family == CHIP_STONEY) {
-               /* If the size is not aligned, we must add a dummy copy at the end
-                * just to align the internal counter. Otherwise, the DMA engine
-                * would slow down by an order of magnitude for following copies.
-                */
-               if (size % SI_CPDMA_ALIGNMENT)
-                       realign_size = SI_CPDMA_ALIGNMENT - (size % SI_CPDMA_ALIGNMENT);
-
-               /* If the copy begins unaligned, we must start copying from the next
-                * aligned block and the skipped part should be copied after everything
-                * else has been copied. Only the src alignment matters, not dst.
-                *
-                * GDS doesn't need the source address to be aligned.
-                */
-               if (src && src_offset % SI_CPDMA_ALIGNMENT) {
-                       skipped_size = SI_CPDMA_ALIGNMENT - (src_offset % SI_CPDMA_ALIGNMENT);
-                       /* The main part will be skipped if the size is too small. */
-                       skipped_size = MIN2(skipped_size, size);
-                       size -= skipped_size;
-               }
-       }
-
-       /* Flush the caches. */
-       if ((dst || src) && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) {
-               sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-                              SI_CONTEXT_CS_PARTIAL_FLUSH |
-                              si_get_flush_flags(sctx, coher, cache_policy);
-       }
-
-       /* This is the main part doing the copying. Src is always aligned. */
-       main_dst_offset = dst_offset + skipped_size;
-       main_src_offset = src_offset + skipped_size;
-
-       while (size) {
-               unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
-               unsigned dma_flags = gds_flags;
-
-               si_cp_dma_prepare(sctx, dst, src, byte_count,
-                                 size + skipped_size + realign_size,
-                                 user_flags, coher, &is_first, &dma_flags);
-
-               si_emit_cp_dma(sctx, sctx->gfx_cs, main_dst_offset, main_src_offset,
-                              byte_count, dma_flags, cache_policy);
-
-               size -= byte_count;
-               main_src_offset += byte_count;
-               main_dst_offset += byte_count;
-       }
-
-       /* Copy the part we skipped because src wasn't aligned. */
-       if (skipped_size) {
-               unsigned dma_flags = gds_flags;
-
-               si_cp_dma_prepare(sctx, dst, src, skipped_size,
-                                 skipped_size + realign_size, user_flags,
-                                 coher, &is_first, &dma_flags);
-
-               si_emit_cp_dma(sctx, sctx->gfx_cs, dst_offset, src_offset, skipped_size,
-                              dma_flags, cache_policy);
-       }
-
-       /* Finally, realign the engine if the size wasn't aligned. */
-       if (realign_size) {
-               si_cp_dma_realign_engine(sctx, realign_size, user_flags, coher,
-                                        cache_policy, &is_first);
-       }
-
-       if (dst && cache_policy != L2_BYPASS)
-               si_resource(dst)->TC_L2_dirty = true;
-
-       /* If it's not a prefetch or GDS copy... */
-       if (dst && src && (dst != src || dst_offset != src_offset)) {
-               sctx->num_cp_dma_calls++;
-               si_prim_discard_signal_next_compute_ib_start(sctx);
-       }
+   uint64_t main_dst_offset, main_src_offset;
+   unsigned skipped_size = 0;
+   unsigned realign_size = 0;
+   unsigned gds_flags = (dst ? 0 : CP_DMA_DST_IS_GDS) | (src ? 0 : CP_DMA_SRC_IS_GDS);
+   bool is_first = true;
+
+   assert(size);
+
+   if (dst) {
+      /* Skip this for the L2 prefetch. */
+      if (dst != src || dst_offset != src_offset) {
+         /* Mark the buffer range of destination as valid (initialized),
+          * so that transfer_map knows it should wait for the GPU when mapping
+          * that range. */
+         util_range_add(dst, &si_resource(dst)->valid_buffer_range, dst_offset, dst_offset + size);
+      }
+
+      dst_offset += si_resource(dst)->gpu_address;
+   }
+   if (src)
+      src_offset += si_resource(src)->gpu_address;
+
+   /* The workarounds aren't needed on Fiji and beyond. */
+   if (sctx->family <= CHIP_CARRIZO || sctx->family == CHIP_STONEY) {
+      /* If the size is not aligned, we must add a dummy copy at the end
+       * just to align the internal counter. Otherwise, the DMA engine
+       * would slow down by an order of magnitude for following copies.
+       */
+      if (size % SI_CPDMA_ALIGNMENT)
+         realign_size = SI_CPDMA_ALIGNMENT - (size % SI_CPDMA_ALIGNMENT);
+
+      /* If the copy begins unaligned, we must start copying from the next
+       * aligned block and the skipped part should be copied after everything
+       * else has been copied. Only the src alignment matters, not dst.
+       *
+       * GDS doesn't need the source address to be aligned.
+       */
+      if (src && src_offset % SI_CPDMA_ALIGNMENT) {
+         skipped_size = SI_CPDMA_ALIGNMENT - (src_offset % SI_CPDMA_ALIGNMENT);
+         /* The main part will be skipped if the size is too small. */
+         skipped_size = MIN2(skipped_size, size);
+         size -= skipped_size;
+      }
+   }
+
+   /* Flush the caches. */
+   if ((dst || src) && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) {
+      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
+                     si_get_flush_flags(sctx, coher, cache_policy);
+   }
+
+   /* This is the main part doing the copying. Src is always aligned. */
+   main_dst_offset = dst_offset + skipped_size;
+   main_src_offset = src_offset + skipped_size;
+
+   while (size) {
+      unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
+      unsigned dma_flags = gds_flags;
+
+      si_cp_dma_prepare(sctx, dst, src, byte_count, size + skipped_size + realign_size, user_flags,
+                        coher, &is_first, &dma_flags);
+
+      si_emit_cp_dma(sctx, sctx->gfx_cs, main_dst_offset, main_src_offset, byte_count, dma_flags,
+                     cache_policy);
+
+      size -= byte_count;
+      main_src_offset += byte_count;
+      main_dst_offset += byte_count;
+   }
+
+   /* Copy the part we skipped because src wasn't aligned. */
+   if (skipped_size) {
+      unsigned dma_flags = gds_flags;
+
+      si_cp_dma_prepare(sctx, dst, src, skipped_size, skipped_size + realign_size, user_flags,
+                        coher, &is_first, &dma_flags);
+
+      si_emit_cp_dma(sctx, sctx->gfx_cs, dst_offset, src_offset, skipped_size, dma_flags,
+                     cache_policy);
+   }
+
+   /* Finally, realign the engine if the size wasn't aligned. */
+   if (realign_size) {
+      si_cp_dma_realign_engine(sctx, realign_size, user_flags, coher, cache_policy, &is_first);
+   }
+
+   if (dst && cache_policy != L2_BYPASS)
+      si_resource(dst)->TC_L2_dirty = true;
+
+   /* If it's not a prefetch or GDS copy... */
+   if (dst && src && (dst != src || dst_offset != src_offset)) {
+      sctx->num_cp_dma_calls++;
+      si_prim_discard_signal_next_compute_ib_start(sctx);
+   }
 }
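
The pre-Fiji workaround in si_cp_dma_copy_buffer() splits an unaligned copy into a misaligned head, an aligned main part, and a dummy realignment copy. The same arithmetic in isolation, again assuming a 32-byte SI_CPDMA_ALIGNMENT purely for illustration:

#include <stdio.h>

/* Assumed value for the example only. */
#define SI_CPDMA_ALIGNMENT 32u

int main(void)
{
   unsigned src_offset = 20;   /* example: not 32-byte aligned */
   unsigned size = 1000;       /* example: not a multiple of 32 */
   unsigned skipped_size = 0, realign_size = 0;

   /* Pad the tail with a dummy copy so the engine's internal counter
    * stays aligned for subsequent copies. */
   if (size % SI_CPDMA_ALIGNMENT)
      realign_size = SI_CPDMA_ALIGNMENT - (size % SI_CPDMA_ALIGNMENT);

   /* Copy the misaligned head separately; only the source alignment matters. */
   if (src_offset % SI_CPDMA_ALIGNMENT) {
      skipped_size = SI_CPDMA_ALIGNMENT - (src_offset % SI_CPDMA_ALIGNMENT);
      if (skipped_size > size)
         skipped_size = size;
      size -= skipped_size;
   }

   printf("main copy %u bytes, head copy %u bytes, dummy realign %u bytes\n",
          size, skipped_size, realign_size);
   return 0;
}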
 
-void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
-                             uint64_t offset, unsigned size)
+void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, uint64_t offset,
+                              unsigned size)
 {
-       assert(sctx->chip_class >= GFX7);
+   assert(sctx->chip_class >= GFX7);
 
-       si_cp_dma_copy_buffer(sctx, buf, buf, offset, offset, size,
-                             SI_CPDMA_SKIP_ALL, SI_COHERENCY_SHADER, L2_LRU);
+   si_cp_dma_copy_buffer(sctx, buf, buf, offset, offset, size, SI_CPDMA_SKIP_ALL,
+                         SI_COHERENCY_SHADER, L2_LRU);
 }
 
-static void cik_prefetch_shader_async(struct si_context *sctx,
-                                     struct si_pm4_state *state)
+static void cik_prefetch_shader_async(struct si_context *sctx, struct si_pm4_state *state)
 {
-       struct pipe_resource *bo = &state->bo[0]->b.b;
-       assert(state->nbo == 1);
+   struct pipe_resource *bo = &state->bo[0]->b.b;
+   assert(state->nbo == 1);
 
-       cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0);
+   cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0);
 }
 
 static void cik_prefetch_VBO_descriptors(struct si_context *sctx)
 {
-       if (!sctx->vertex_elements || !sctx->vertex_elements->vb_desc_list_alloc_size)
-               return;
+   if (!sctx->vertex_elements || !sctx->vertex_elements->vb_desc_list_alloc_size)
+      return;
 
-       cik_prefetch_TC_L2_async(sctx, &sctx->vb_descriptors_buffer->b.b,
-                                sctx->vb_descriptors_offset,
-                                sctx->vertex_elements->vb_desc_list_alloc_size);
+   cik_prefetch_TC_L2_async(sctx, &sctx->vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset,
+                            sctx->vertex_elements->vb_desc_list_alloc_size);
 }
 
 /**
@@ -449,191 +421,185 @@ static void cik_prefetch_VBO_descriptors(struct si_context *sctx)
  */
 void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only)
 {
-       unsigned mask = sctx->prefetch_L2_mask;
-       assert(mask);
-
-       /* Prefetch shaders and VBO descriptors to TC L2. */
-       if (sctx->chip_class >= GFX9) {
-               /* Choose the right spot for the VBO prefetch. */
-               if (sctx->queued.named.hs) {
-                       if (mask & SI_PREFETCH_HS)
-                               cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
-                       if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
-                               cik_prefetch_VBO_descriptors(sctx);
-                       if (vertex_stage_only) {
-                               sctx->prefetch_L2_mask &= ~(SI_PREFETCH_HS |
-                                                           SI_PREFETCH_VBO_DESCRIPTORS);
-                               return;
-                       }
-
-                       if (mask & SI_PREFETCH_GS)
-                               cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
-                       if (mask & SI_PREFETCH_VS)
-                               cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
-               } else if (sctx->queued.named.gs) {
-                       if (mask & SI_PREFETCH_GS)
-                               cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
-                       if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
-                               cik_prefetch_VBO_descriptors(sctx);
-                       if (vertex_stage_only) {
-                               sctx->prefetch_L2_mask &= ~(SI_PREFETCH_GS |
-                                                           SI_PREFETCH_VBO_DESCRIPTORS);
-                               return;
-                       }
-
-                       if (mask & SI_PREFETCH_VS)
-                               cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
-               } else {
-                       if (mask & SI_PREFETCH_VS)
-                               cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
-                       if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
-                               cik_prefetch_VBO_descriptors(sctx);
-                       if (vertex_stage_only) {
-                               sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS |
-                                                           SI_PREFETCH_VBO_DESCRIPTORS);
-                               return;
-                       }
-               }
-       } else {
-               /* GFX6-GFX8 */
-               /* Choose the right spot for the VBO prefetch. */
-               if (sctx->tes_shader.cso) {
-                       if (mask & SI_PREFETCH_LS)
-                               cik_prefetch_shader_async(sctx, sctx->queued.named.ls);
-                       if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
-                               cik_prefetch_VBO_descriptors(sctx);
-                       if (vertex_stage_only) {
-                               sctx->prefetch_L2_mask &= ~(SI_PREFETCH_LS |
-                                                           SI_PREFETCH_VBO_DESCRIPTORS);
-                               return;
-                       }
-
-                       if (mask & SI_PREFETCH_HS)
-                               cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
-                       if (mask & SI_PREFETCH_ES)
-                               cik_prefetch_shader_async(sctx, sctx->queued.named.es);
-                       if (mask & SI_PREFETCH_GS)
-                               cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
-                       if (mask & SI_PREFETCH_VS)
-                               cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
-               } else if (sctx->gs_shader.cso) {
-                       if (mask & SI_PREFETCH_ES)
-                               cik_prefetch_shader_async(sctx, sctx->queued.named.es);
-                       if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
-                               cik_prefetch_VBO_descriptors(sctx);
-                       if (vertex_stage_only) {
-                               sctx->prefetch_L2_mask &= ~(SI_PREFETCH_ES |
-                                                           SI_PREFETCH_VBO_DESCRIPTORS);
-                               return;
-                       }
-
-                       if (mask & SI_PREFETCH_GS)
-                               cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
-                       if (mask & SI_PREFETCH_VS)
-                               cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
-               } else {
-                       if (mask & SI_PREFETCH_VS)
-                               cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
-                       if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
-                               cik_prefetch_VBO_descriptors(sctx);
-                       if (vertex_stage_only) {
-                               sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS |
-                                                           SI_PREFETCH_VBO_DESCRIPTORS);
-                               return;
-                       }
-               }
-       }
-
-       if (mask & SI_PREFETCH_PS)
-               cik_prefetch_shader_async(sctx, sctx->queued.named.ps);
-
-       sctx->prefetch_L2_mask = 0;
+   unsigned mask = sctx->prefetch_L2_mask;
+   assert(mask);
+
+   /* Prefetch shaders and VBO descriptors to TC L2. */
+   if (sctx->chip_class >= GFX9) {
+      /* Choose the right spot for the VBO prefetch. */
+      if (sctx->queued.named.hs) {
+         if (mask & SI_PREFETCH_HS)
+            cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
+         if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
+            cik_prefetch_VBO_descriptors(sctx);
+         if (vertex_stage_only) {
+            sctx->prefetch_L2_mask &= ~(SI_PREFETCH_HS | SI_PREFETCH_VBO_DESCRIPTORS);
+            return;
+         }
+
+         if (mask & SI_PREFETCH_GS)
+            cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
+         if (mask & SI_PREFETCH_VS)
+            cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
+      } else if (sctx->queued.named.gs) {
+         if (mask & SI_PREFETCH_GS)
+            cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
+         if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
+            cik_prefetch_VBO_descriptors(sctx);
+         if (vertex_stage_only) {
+            sctx->prefetch_L2_mask &= ~(SI_PREFETCH_GS | SI_PREFETCH_VBO_DESCRIPTORS);
+            return;
+         }
+
+         if (mask & SI_PREFETCH_VS)
+            cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
+      } else {
+         if (mask & SI_PREFETCH_VS)
+            cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
+         if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
+            cik_prefetch_VBO_descriptors(sctx);
+         if (vertex_stage_only) {
+            sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS | SI_PREFETCH_VBO_DESCRIPTORS);
+            return;
+         }
+      }
+   } else {
+      /* GFX6-GFX8 */
+      /* Choose the right spot for the VBO prefetch. */
+      if (sctx->tes_shader.cso) {
+         if (mask & SI_PREFETCH_LS)
+            cik_prefetch_shader_async(sctx, sctx->queued.named.ls);
+         if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
+            cik_prefetch_VBO_descriptors(sctx);
+         if (vertex_stage_only) {
+            sctx->prefetch_L2_mask &= ~(SI_PREFETCH_LS | SI_PREFETCH_VBO_DESCRIPTORS);
+            return;
+         }
+
+         if (mask & SI_PREFETCH_HS)
+            cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
+         if (mask & SI_PREFETCH_ES)
+            cik_prefetch_shader_async(sctx, sctx->queued.named.es);
+         if (mask & SI_PREFETCH_GS)
+            cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
+         if (mask & SI_PREFETCH_VS)
+            cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
+      } else if (sctx->gs_shader.cso) {
+         if (mask & SI_PREFETCH_ES)
+            cik_prefetch_shader_async(sctx, sctx->queued.named.es);
+         if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
+            cik_prefetch_VBO_descriptors(sctx);
+         if (vertex_stage_only) {
+            sctx->prefetch_L2_mask &= ~(SI_PREFETCH_ES | SI_PREFETCH_VBO_DESCRIPTORS);
+            return;
+         }
+
+         if (mask & SI_PREFETCH_GS)
+            cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
+         if (mask & SI_PREFETCH_VS)
+            cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
+      } else {
+         if (mask & SI_PREFETCH_VS)
+            cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
+         if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
+            cik_prefetch_VBO_descriptors(sctx);
+         if (vertex_stage_only) {
+            sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS | SI_PREFETCH_VBO_DESCRIPTORS);
+            return;
+         }
+      }
+   }
+
+   if (mask & SI_PREFETCH_PS)
+      cik_prefetch_shader_async(sctx, sctx->queued.named.ps);
+
+   sctx->prefetch_L2_mask = 0;
 }
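
cik_emit_prefetch_L2() consumes prefetch_L2_mask either fully or, with vertex_stage_only, only the vertex-stage bits, leaving the rest pending for a later call. A toy sketch of that masking idea (the flag names below are illustrative stand-ins, not the SI_PREFETCH_* values):

#include <stdio.h>

/* Illustrative stand-ins for the SI_PREFETCH_* bits. */
enum {
   PREFETCH_VS = 1 << 0,
   PREFETCH_PS = 1 << 1,
   PREFETCH_VBO_DESCRIPTORS = 1 << 2,
};

int main(void)
{
   unsigned prefetch_mask = PREFETCH_VS | PREFETCH_PS | PREFETCH_VBO_DESCRIPTORS;
   int vertex_stage_only = 1;

   if (vertex_stage_only) {
      /* Only the vertex-stage work is prefetched now; the remaining bits
       * stay set so a later call can finish the job. */
      prefetch_mask &= ~(PREFETCH_VS | PREFETCH_VBO_DESCRIPTORS);
      printf("pending mask after vertex-only prefetch: 0x%x\n", prefetch_mask);
      return 0;
   }

   prefetch_mask = 0;
   printf("all prefetches issued\n");
   return 0;
}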
 
 void si_test_gds(struct si_context *sctx)
 {
-       struct pipe_context *ctx = &sctx->b;
-       struct pipe_resource *src, *dst;
-       unsigned r[4] = {};
-       unsigned offset = debug_get_num_option("OFFSET", 16);
-
-       src = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16);
-       dst = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16);
-       si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 0, 4, 0xabcdef01, 0, SI_COHERENCY_SHADER, L2_BYPASS);
-       si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 4, 4, 0x23456789, 0, SI_COHERENCY_SHADER, L2_BYPASS);
-       si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 8, 4, 0x87654321, 0, SI_COHERENCY_SHADER, L2_BYPASS);
-       si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 12, 4, 0xfedcba98, 0, SI_COHERENCY_SHADER, L2_BYPASS);
-       si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, 16, 0xdeadbeef, 0, SI_COHERENCY_SHADER, L2_BYPASS);
-
-       si_cp_dma_copy_buffer(sctx, NULL, src, offset, 0, 16, 0, SI_COHERENCY_NONE, L2_BYPASS);
-       si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, 0, SI_COHERENCY_NONE, L2_BYPASS);
-
-       pipe_buffer_read(ctx, dst, 0, sizeof(r), r);
-       printf("GDS copy  = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], r[3],
-                       r[0] == 0xabcdef01 && r[1] == 0x23456789 &&
-                       r[2] == 0x87654321 && r[3] == 0xfedcba98 ? "pass" : "fail");
-
-       si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, NULL, offset, 16, 0xc1ea4146, 0, SI_COHERENCY_NONE, L2_BYPASS);
-       si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, 0, SI_COHERENCY_NONE, L2_BYPASS);
-
-       pipe_buffer_read(ctx, dst, 0, sizeof(r), r);
-       printf("GDS clear = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], r[3],
-                       r[0] == 0xc1ea4146 && r[1] == 0xc1ea4146 &&
-                       r[2] == 0xc1ea4146 && r[3] == 0xc1ea4146 ? "pass" : "fail");
-
-       pipe_resource_reference(&src, NULL);
-       pipe_resource_reference(&dst, NULL);
-       exit(0);
+   struct pipe_context *ctx = &sctx->b;
+   struct pipe_resource *src, *dst;
+   unsigned r[4] = {};
+   unsigned offset = debug_get_num_option("OFFSET", 16);
+
+   src = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16);
+   dst = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16);
+   si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 0, 4, 0xabcdef01, 0, SI_COHERENCY_SHADER,
+                          L2_BYPASS);
+   si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 4, 4, 0x23456789, 0, SI_COHERENCY_SHADER,
+                          L2_BYPASS);
+   si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 8, 4, 0x87654321, 0, SI_COHERENCY_SHADER,
+                          L2_BYPASS);
+   si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 12, 4, 0xfedcba98, 0, SI_COHERENCY_SHADER,
+                          L2_BYPASS);
+   si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, 16, 0xdeadbeef, 0, SI_COHERENCY_SHADER,
+                          L2_BYPASS);
+
+   si_cp_dma_copy_buffer(sctx, NULL, src, offset, 0, 16, 0, SI_COHERENCY_NONE, L2_BYPASS);
+   si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, 0, SI_COHERENCY_NONE, L2_BYPASS);
+
+   pipe_buffer_read(ctx, dst, 0, sizeof(r), r);
+   printf("GDS copy  = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], r[3],
+          r[0] == 0xabcdef01 && r[1] == 0x23456789 && r[2] == 0x87654321 && r[3] == 0xfedcba98
+             ? "pass"
+             : "fail");
+
+   si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, NULL, offset, 16, 0xc1ea4146, 0, SI_COHERENCY_NONE,
+                          L2_BYPASS);
+   si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, 0, SI_COHERENCY_NONE, L2_BYPASS);
+
+   pipe_buffer_read(ctx, dst, 0, sizeof(r), r);
+   printf("GDS clear = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], r[3],
+          r[0] == 0xc1ea4146 && r[1] == 0xc1ea4146 && r[2] == 0xc1ea4146 && r[3] == 0xc1ea4146
+             ? "pass"
+             : "fail");
+
+   pipe_resource_reference(&src, NULL);
+   pipe_resource_reference(&dst, NULL);
+   exit(0);
 }
 
-void si_cp_write_data(struct si_context *sctx, struct si_resource *buf,
-                     unsigned offset, unsigned size, unsigned dst_sel,
-                     unsigned engine, const void *data)
+void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, unsigned offset,
+                      unsigned size, unsigned dst_sel, unsigned engine, const void *data)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
-       assert(offset % 4 == 0);
-       assert(size % 4 == 0);
+   assert(offset % 4 == 0);
+   assert(size % 4 == 0);
 
-       if (sctx->chip_class == GFX6 && dst_sel == V_370_MEM)
-               dst_sel = V_370_MEM_GRBM;
+   if (sctx->chip_class == GFX6 && dst_sel == V_370_MEM)
+      dst_sel = V_370_MEM_GRBM;
 
-       radeon_add_to_buffer_list(sctx, cs, buf,
-                                 RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
-       uint64_t va = buf->gpu_address + offset;
+   radeon_add_to_buffer_list(sctx, cs, buf, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
+   uint64_t va = buf->gpu_address + offset;
 
-       radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + size/4, 0));
-       radeon_emit(cs, S_370_DST_SEL(dst_sel) |
-                   S_370_WR_CONFIRM(1) |
-                   S_370_ENGINE_SEL(engine));
-       radeon_emit(cs, va);
-       radeon_emit(cs, va >> 32);
-       radeon_emit_array(cs, (const uint32_t*)data, size/4);
+   radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + size / 4, 0));
+   radeon_emit(cs, S_370_DST_SEL(dst_sel) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine));
+   radeon_emit(cs, va);
+   radeon_emit(cs, va >> 32);
+   radeon_emit_array(cs, (const uint32_t *)data, size / 4);
 }
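
The WRITE_DATA sizing in si_cp_write_data() — PKT3(PKT3_WRITE_DATA, 2 + size / 4, 0) — follows from the PM4 convention that the PKT3 count field is the number of body dwords minus one, with the body here being the control dword, the address low/high pair, and then the payload. A quick check of that arithmetic:

#include <stdio.h>

int main(void)
{
   unsigned size = 16; /* example payload in bytes; must be a multiple of 4 */

   /* Body: control + addr_lo + addr_hi + payload dwords. */
   unsigned body_dwords = 3 + size / 4;
   unsigned pkt3_count = body_dwords - 1; /* equals 2 + size / 4 */

   printf("WRITE_DATA: body %u dwords, PKT3 count field %u\n", body_dwords, pkt3_count);
   return 0;
}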
 
-void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs,
-                    unsigned dst_sel, struct si_resource *dst, unsigned dst_offset,
-                    unsigned src_sel, struct si_resource *src, unsigned src_offset)
+void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned dst_sel,
+                     struct si_resource *dst, unsigned dst_offset, unsigned src_sel,
+                     struct si_resource *src, unsigned src_offset)
 {
-       /* cs can point to the compute IB, which has the buffer list in gfx_cs. */
-       if (dst) {
-               radeon_add_to_buffer_list(sctx, sctx->gfx_cs, dst,
-                                         RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
-       }
-       if (src) {
-               radeon_add_to_buffer_list(sctx, sctx->gfx_cs, src,
-                                         RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
-       }
-
-       uint64_t dst_va = (dst ? dst->gpu_address : 0ull) + dst_offset;
-       uint64_t src_va = (src ? src->gpu_address : 0ull) + src_offset;
-
-       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
-       radeon_emit(cs, COPY_DATA_SRC_SEL(src_sel) |
-                       COPY_DATA_DST_SEL(dst_sel) |
-                       COPY_DATA_WR_CONFIRM);
-       radeon_emit(cs, src_va);
-       radeon_emit(cs, src_va >> 32);
-       radeon_emit(cs, dst_va);
-       radeon_emit(cs, dst_va >> 32);
+   /* cs can point to the compute IB, which has the buffer list in gfx_cs. */
+   if (dst) {
+      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, dst, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
+   }
+   if (src) {
+      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, src, RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
+   }
+
+   uint64_t dst_va = (dst ? dst->gpu_address : 0ull) + dst_offset;
+   uint64_t src_va = (src ? src->gpu_address : 0ull) + src_offset;
+
+   radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+   radeon_emit(cs, COPY_DATA_SRC_SEL(src_sel) | COPY_DATA_DST_SEL(dst_sel) | COPY_DATA_WR_CONFIRM);
+   radeon_emit(cs, src_va);
+   radeon_emit(cs, src_va >> 32);
+   radeon_emit(cs, dst_va);
+   radeon_emit(cs, dst_va >> 32);
 }
src/gallium/drivers/radeonsi/si_debug.c
index cbd92c02c734b1e44f8b55f061ccef1e6598da84..acd86730d0bd84121a3770f9fcd5132131ffc7dc 100644
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include "si_pipe.h"
+#include "ac_debug.h"
+#include "ac_rtld.h"
+#include "driver_ddebug/dd_util.h"
 #include "si_compute.h"
+#include "si_pipe.h"
 #include "sid.h"
 #include "sid_tables.h"
 #include "tgsi/tgsi_from_mesa.h"
-#include "driver_ddebug/dd_util.h"
 #include "util/u_dump.h"
 #include "util/u_log.h"
 #include "util/u_memory.h"
 #include "util/u_string.h"
-#include "ac_debug.h"
-#include "ac_rtld.h"
 
-static void si_dump_bo_list(struct si_context *sctx,
-                           const struct radeon_saved_cs *saved, FILE *f);
+static void si_dump_bo_list(struct si_context *sctx, const struct radeon_saved_cs *saved, FILE *f);
 
 DEBUG_GET_ONCE_OPTION(replace_shaders, "RADEON_REPLACE_SHADERS", NULL)
 
@@ -44,155 +43,148 @@ DEBUG_GET_ONCE_OPTION(replace_shaders, "RADEON_REPLACE_SHADERS", NULL)
  * Store a linearized copy of all chunks of \p cs together with the buffer
  * list in \p saved.
  */
-void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs,
-               struct radeon_saved_cs *saved, bool get_buffer_list)
+void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs, struct radeon_saved_cs *saved,
+                bool get_buffer_list)
 {
-       uint32_t *buf;
-       unsigned i;
-
-       /* Save the IB chunks. */
-       saved->num_dw = cs->prev_dw + cs->current.cdw;
-       saved->ib = MALLOC(4 * saved->num_dw);
-       if (!saved->ib)
-               goto oom;
-
-       buf = saved->ib;
-       for (i = 0; i < cs->num_prev; ++i) {
-               memcpy(buf, cs->prev[i].buf, cs->prev[i].cdw * 4);
-               buf += cs->prev[i].cdw;
-       }
-       memcpy(buf, cs->current.buf, cs->current.cdw * 4);
-
-       if (!get_buffer_list)
-               return;
-
-       /* Save the buffer list. */
-       saved->bo_count = ws->cs_get_buffer_list(cs, NULL);
-       saved->bo_list = CALLOC(saved->bo_count,
-                               sizeof(saved->bo_list[0]));
-       if (!saved->bo_list) {
-               FREE(saved->ib);
-               goto oom;
-       }
-       ws->cs_get_buffer_list(cs, saved->bo_list);
-
-       return;
+   uint32_t *buf;
+   unsigned i;
+
+   /* Save the IB chunks. */
+   saved->num_dw = cs->prev_dw + cs->current.cdw;
+   saved->ib = MALLOC(4 * saved->num_dw);
+   if (!saved->ib)
+      goto oom;
+
+   buf = saved->ib;
+   for (i = 0; i < cs->num_prev; ++i) {
+      memcpy(buf, cs->prev[i].buf, cs->prev[i].cdw * 4);
+      buf += cs->prev[i].cdw;
+   }
+   memcpy(buf, cs->current.buf, cs->current.cdw * 4);
+
+   if (!get_buffer_list)
+      return;
+
+   /* Save the buffer list. */
+   saved->bo_count = ws->cs_get_buffer_list(cs, NULL);
+   saved->bo_list = CALLOC(saved->bo_count, sizeof(saved->bo_list[0]));
+   if (!saved->bo_list) {
+      FREE(saved->ib);
+      goto oom;
+   }
+   ws->cs_get_buffer_list(cs, saved->bo_list);
+
+   return;
 
 oom:
-       fprintf(stderr, "%s: out of memory\n", __func__);
-       memset(saved, 0, sizeof(*saved));
+   fprintf(stderr, "%s: out of memory\n", __func__);
+   memset(saved, 0, sizeof(*saved));
 }
 
 void si_clear_saved_cs(struct radeon_saved_cs *saved)
 {
-       FREE(saved->ib);
-       FREE(saved->bo_list);
+   FREE(saved->ib);
+   FREE(saved->bo_list);
 
-       memset(saved, 0, sizeof(*saved));
+   memset(saved, 0, sizeof(*saved));
 }
 
 void si_destroy_saved_cs(struct si_saved_cs *scs)
 {
-       si_clear_saved_cs(&scs->gfx);
-       si_resource_reference(&scs->trace_buf, NULL);
-       free(scs);
+   si_clear_saved_cs(&scs->gfx);
+   si_resource_reference(&scs->trace_buf, NULL);
+   free(scs);
 }
 
-static void si_dump_shader(struct si_screen *sscreen,
-                          struct si_shader *shader, FILE *f)
+static void si_dump_shader(struct si_screen *sscreen, struct si_shader *shader, FILE *f)
 {
-       if (shader->shader_log)
-               fwrite(shader->shader_log, shader->shader_log_size, 1, f);
-       else
-               si_shader_dump(sscreen, shader, NULL, f, false);
+   if (shader->shader_log)
+      fwrite(shader->shader_log, shader->shader_log_size, 1, f);
+   else
+      si_shader_dump(sscreen, shader, NULL, f, false);
 
-       if (shader->bo && sscreen->options.dump_shader_binary) {
-               unsigned size = shader->bo->b.b.width0;
-               fprintf(f, "BO: VA=%"PRIx64" Size=%u\n", shader->bo->gpu_address, size);
+   if (shader->bo && sscreen->options.dump_shader_binary) {
+      unsigned size = shader->bo->b.b.width0;
+      fprintf(f, "BO: VA=%" PRIx64 " Size=%u\n", shader->bo->gpu_address, size);
 
-               const char *mapped = sscreen->ws->buffer_map(shader->bo->buf, NULL,
-                                                      PIPE_TRANSFER_UNSYNCHRONIZED |
-                                                      PIPE_TRANSFER_READ |
-                                                      RADEON_TRANSFER_TEMPORARY);
+      const char *mapped = sscreen->ws->buffer_map(
+         shader->bo->buf, NULL,
+         PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_READ | RADEON_TRANSFER_TEMPORARY);
 
-               for (unsigned i = 0; i < size; i += 4) {
-                       fprintf(f, " %4x: %08x\n", i, *(uint32_t*)(mapped + i));
-               }
+      for (unsigned i = 0; i < size; i += 4) {
+         fprintf(f, " %4x: %08x\n", i, *(uint32_t *)(mapped + i));
+      }
 
-               sscreen->ws->buffer_unmap(shader->bo->buf);
+      sscreen->ws->buffer_unmap(shader->bo->buf);
 
-               fprintf(f, "\n");
-       }
+      fprintf(f, "\n");
+   }
 }
 
 struct si_log_chunk_shader {
-       /* The shader destroy code assumes a current context for unlinking of
-        * PM4 packets etc.
-        *
-        * While we should be able to destroy shaders without a context, doing
-        * so would happen only very rarely and be therefore likely to fail
-        * just when you're trying to debug something. Let's just remember the
-        * current context in the chunk.
-        */
-       struct si_context *ctx;
-       struct si_shader *shader;
-
-       /* For keep-alive reference counts */
-       struct si_shader_selector *sel;
-       struct si_compute *program;
+   /* The shader destroy code assumes a current context for unlinking of
+    * PM4 packets etc.
+    *
+    * While we should be able to destroy shaders without a context, doing
+    * so would happen only very rarely and be therefore likely to fail
+    * just when you're trying to debug something. Let's just remember the
+    * current context in the chunk.
+    */
+   struct si_context *ctx;
+   struct si_shader *shader;
+
+   /* For keep-alive reference counts */
+   struct si_shader_selector *sel;
+   struct si_compute *program;
 };
 
-static void
-si_log_chunk_shader_destroy(void *data)
+static void si_log_chunk_shader_destroy(void *data)
 {
-       struct si_log_chunk_shader *chunk = data;
-       si_shader_selector_reference(chunk->ctx, &chunk->sel, NULL);
-       si_compute_reference(&chunk->program, NULL);
-       FREE(chunk);
+   struct si_log_chunk_shader *chunk = data;
+   si_shader_selector_reference(chunk->ctx, &chunk->sel, NULL);
+   si_compute_reference(&chunk->program, NULL);
+   FREE(chunk);
 }
 
-static void
-si_log_chunk_shader_print(void *data, FILE *f)
+static void si_log_chunk_shader_print(void *data, FILE *f)
 {
-       struct si_log_chunk_shader *chunk = data;
-       struct si_screen *sscreen = chunk->ctx->screen;
-       si_dump_shader(sscreen, chunk->shader, f);
+   struct si_log_chunk_shader *chunk = data;
+   struct si_screen *sscreen = chunk->ctx->screen;
+   si_dump_shader(sscreen, chunk->shader, f);
 }
 
 static struct u_log_chunk_type si_log_chunk_type_shader = {
-       .destroy = si_log_chunk_shader_destroy,
-       .print = si_log_chunk_shader_print,
+   .destroy = si_log_chunk_shader_destroy,
+   .print = si_log_chunk_shader_print,
 };
 
-static void si_dump_gfx_shader(struct si_context *ctx,
-                              const struct si_shader_ctx_state *state,
-                              struct u_log_context *log)
+static void si_dump_gfx_shader(struct si_context *ctx, const struct si_shader_ctx_state *state,
+                               struct u_log_context *log)
 {
-       struct si_shader *current = state->current;
+   struct si_shader *current = state->current;
 
-       if (!state->cso || !current)
-               return;
+   if (!state->cso || !current)
+      return;
 
-       struct si_log_chunk_shader *chunk = CALLOC_STRUCT(si_log_chunk_shader);
-       chunk->ctx = ctx;
-       chunk->shader = current;
-       si_shader_selector_reference(ctx, &chunk->sel, current->selector);
-       u_log_chunk(log, &si_log_chunk_type_shader, chunk);
+   struct si_log_chunk_shader *chunk = CALLOC_STRUCT(si_log_chunk_shader);
+   chunk->ctx = ctx;
+   chunk->shader = current;
+   si_shader_selector_reference(ctx, &chunk->sel, current->selector);
+   u_log_chunk(log, &si_log_chunk_type_shader, chunk);
 }
 
-static void si_dump_compute_shader(struct si_context *ctx,
-                                  struct u_log_context *log)
+static void si_dump_compute_shader(struct si_context *ctx, struct u_log_context *log)
 {
-       const struct si_cs_shader_state *state = &ctx->cs_shader_state;
+   const struct si_cs_shader_state *state = &ctx->cs_shader_state;
 
-       if (!state->program)
-               return;
+   if (!state->program)
+      return;
 
-       struct si_log_chunk_shader *chunk = CALLOC_STRUCT(si_log_chunk_shader);
-       chunk->ctx = ctx;
-       chunk->shader = &state->program->shader;
-       si_compute_reference(&chunk->program, state->program);
-       u_log_chunk(log, &si_log_chunk_type_shader, chunk);
+   struct si_log_chunk_shader *chunk = CALLOC_STRUCT(si_log_chunk_shader);
+   chunk->ctx = ctx;
+   chunk->shader = &state->program->shader;
+   si_compute_reference(&chunk->program, state->program);
+   u_log_chunk(log, &si_log_chunk_type_shader, chunk);
 }
 
 /**
@@ -203,724 +195,664 @@ static void si_dump_compute_shader(struct si_context *ctx,
  */
 bool si_replace_shader(unsigned num, struct si_shader_binary *binary)
 {
-       const char *p = debug_get_option_replace_shaders();
-       const char *semicolon;
-       char *copy = NULL;
-       FILE *f;
-       long filesize, nread;
-       bool replaced = false;
-
-       if (!p)
-               return false;
-
-       while (*p) {
-               unsigned long i;
-               char *endp;
-               i = strtoul(p, &endp, 0);
-
-               p = endp;
-               if (*p != ':') {
-                       fprintf(stderr, "RADEON_REPLACE_SHADERS formatted badly.\n");
-                       exit(1);
-               }
-               ++p;
-
-               if (i == num)
-                       break;
-
-               p = strchr(p, ';');
-               if (!p)
-                       return false;
-               ++p;
-       }
-       if (!*p)
-               return false;
-
-       semicolon = strchr(p, ';');
-       if (semicolon) {
-               p = copy = strndup(p, semicolon - p);
-               if (!copy) {
-                       fprintf(stderr, "out of memory\n");
-                       return false;
-               }
-       }
-
-       fprintf(stderr, "radeonsi: replace shader %u by %s\n", num, p);
-
-       f = fopen(p, "r");
-       if (!f) {
-               perror("radeonsi: failed to open file");
-               goto out_free;
-       }
-
-       if (fseek(f, 0, SEEK_END) != 0)
-               goto file_error;
-
-       filesize = ftell(f);
-       if (filesize < 0)
-               goto file_error;
-
-       if (fseek(f, 0, SEEK_SET) != 0)
-               goto file_error;
-
-       binary->elf_buffer = MALLOC(filesize);
-       if (!binary->elf_buffer) {
-               fprintf(stderr, "out of memory\n");
-               goto out_close;
-       }
-
-       nread = fread((void*)binary->elf_buffer, 1, filesize, f);
-       if (nread != filesize) {
-               FREE((void*)binary->elf_buffer);
-               binary->elf_buffer = NULL;
-               goto file_error;
-       }
-
-       binary->elf_size = nread;
-       replaced = true;
+   const char *p = debug_get_option_replace_shaders();
+   const char *semicolon;
+   char *copy = NULL;
+   FILE *f;
+   long filesize, nread;
+   bool replaced = false;
+
+   if (!p)
+      return false;
+
+   while (*p) {
+      unsigned long i;
+      char *endp;
+      i = strtoul(p, &endp, 0);
+
+      p = endp;
+      if (*p != ':') {
+         fprintf(stderr, "RADEON_REPLACE_SHADERS formatted badly.\n");
+         exit(1);
+      }
+      ++p;
+
+      if (i == num)
+         break;
+
+      p = strchr(p, ';');
+      if (!p)
+         return false;
+      ++p;
+   }
+   if (!*p)
+      return false;
+
+   semicolon = strchr(p, ';');
+   if (semicolon) {
+      p = copy = strndup(p, semicolon - p);
+      if (!copy) {
+         fprintf(stderr, "out of memory\n");
+         return false;
+      }
+   }
+
+   fprintf(stderr, "radeonsi: replace shader %u by %s\n", num, p);
+
+   f = fopen(p, "r");
+   if (!f) {
+      perror("radeonsi: failed to open file");
+      goto out_free;
+   }
+
+   if (fseek(f, 0, SEEK_END) != 0)
+      goto file_error;
+
+   filesize = ftell(f);
+   if (filesize < 0)
+      goto file_error;
+
+   if (fseek(f, 0, SEEK_SET) != 0)
+      goto file_error;
+
+   binary->elf_buffer = MALLOC(filesize);
+   if (!binary->elf_buffer) {
+      fprintf(stderr, "out of memory\n");
+      goto out_close;
+   }
+
+   nread = fread((void *)binary->elf_buffer, 1, filesize, f);
+   if (nread != filesize) {
+      FREE((void *)binary->elf_buffer);
+      binary->elf_buffer = NULL;
+      goto file_error;
+   }
+
+   binary->elf_size = nread;
+   replaced = true;
 
 out_close:
-       fclose(f);
+   fclose(f);
 out_free:
-       free(copy);
-       return replaced;
+   free(copy);
+   return replaced;
 
 file_error:
-       perror("radeonsi: reading shader");
-       goto out_close;
+   perror("radeonsi: reading shader");
+   goto out_close;
 }
 
 /* Parsed IBs are difficult to read without colors. Use "less -R file" to
  * read them, or use "aha -b -f file" to convert them to html.
  */
-#define COLOR_RESET    "\033[0m"
-#define COLOR_RED      "\033[31m"
-#define COLOR_GREEN    "\033[1;32m"
-#define COLOR_YELLOW   "\033[1;33m"
-#define COLOR_CYAN     "\033[1;36m"
-
-static void si_dump_mmapped_reg(struct si_context *sctx, FILE *f,
-                               unsigned offset)
+#define COLOR_RESET  "\033[0m"
+#define COLOR_RED    "\033[31m"
+#define COLOR_GREEN  "\033[1;32m"
+#define COLOR_YELLOW "\033[1;33m"
+#define COLOR_CYAN   "\033[1;36m"
+
+static void si_dump_mmapped_reg(struct si_context *sctx, FILE *f, unsigned offset)
 {
-       struct radeon_winsys *ws = sctx->ws;
-       uint32_t value;
+   struct radeon_winsys *ws = sctx->ws;
+   uint32_t value;
 
-       if (ws->read_registers(ws, offset, 1, &value))
-               ac_dump_reg(f, sctx->chip_class, offset, value, ~0);
+   if (ws->read_registers(ws, offset, 1, &value))
+      ac_dump_reg(f, sctx->chip_class, offset, value, ~0);
 }
 
 static void si_dump_debug_registers(struct si_context *sctx, FILE *f)
 {
-       if (!sctx->screen->info.has_read_registers_query)
-               return;
-
-       fprintf(f, "Memory-mapped registers:\n");
-       si_dump_mmapped_reg(sctx, f, R_008010_GRBM_STATUS);
-
-       /* No other registers can be read on DRM < 3.1.0. */
-       if (!sctx->screen->info.is_amdgpu ||
-           sctx->screen->info.drm_minor < 1) {
-               fprintf(f, "\n");
-               return;
-       }
-
-       si_dump_mmapped_reg(sctx, f, R_008008_GRBM_STATUS2);
-       si_dump_mmapped_reg(sctx, f, R_008014_GRBM_STATUS_SE0);
-       si_dump_mmapped_reg(sctx, f, R_008018_GRBM_STATUS_SE1);
-       si_dump_mmapped_reg(sctx, f, R_008038_GRBM_STATUS_SE2);
-       si_dump_mmapped_reg(sctx, f, R_00803C_GRBM_STATUS_SE3);
-       si_dump_mmapped_reg(sctx, f, R_00D034_SDMA0_STATUS_REG);
-       si_dump_mmapped_reg(sctx, f, R_00D834_SDMA1_STATUS_REG);
-       if (sctx->chip_class <= GFX8) {
-               si_dump_mmapped_reg(sctx, f, R_000E50_SRBM_STATUS);
-               si_dump_mmapped_reg(sctx, f, R_000E4C_SRBM_STATUS2);
-               si_dump_mmapped_reg(sctx, f, R_000E54_SRBM_STATUS3);
-       }
-       si_dump_mmapped_reg(sctx, f, R_008680_CP_STAT);
-       si_dump_mmapped_reg(sctx, f, R_008674_CP_STALLED_STAT1);
-       si_dump_mmapped_reg(sctx, f, R_008678_CP_STALLED_STAT2);
-       si_dump_mmapped_reg(sctx, f, R_008670_CP_STALLED_STAT3);
-       si_dump_mmapped_reg(sctx, f, R_008210_CP_CPC_STATUS);
-       si_dump_mmapped_reg(sctx, f, R_008214_CP_CPC_BUSY_STAT);
-       si_dump_mmapped_reg(sctx, f, R_008218_CP_CPC_STALLED_STAT1);
-       si_dump_mmapped_reg(sctx, f, R_00821C_CP_CPF_STATUS);
-       si_dump_mmapped_reg(sctx, f, R_008220_CP_CPF_BUSY_STAT);
-       si_dump_mmapped_reg(sctx, f, R_008224_CP_CPF_STALLED_STAT1);
-       fprintf(f, "\n");
+   if (!sctx->screen->info.has_read_registers_query)
+      return;
+
+   fprintf(f, "Memory-mapped registers:\n");
+   si_dump_mmapped_reg(sctx, f, R_008010_GRBM_STATUS);
+
+   /* No other registers can be read on DRM < 3.1.0. */
+   if (!sctx->screen->info.is_amdgpu || sctx->screen->info.drm_minor < 1) {
+      fprintf(f, "\n");
+      return;
+   }
+
+   si_dump_mmapped_reg(sctx, f, R_008008_GRBM_STATUS2);
+   si_dump_mmapped_reg(sctx, f, R_008014_GRBM_STATUS_SE0);
+   si_dump_mmapped_reg(sctx, f, R_008018_GRBM_STATUS_SE1);
+   si_dump_mmapped_reg(sctx, f, R_008038_GRBM_STATUS_SE2);
+   si_dump_mmapped_reg(sctx, f, R_00803C_GRBM_STATUS_SE3);
+   si_dump_mmapped_reg(sctx, f, R_00D034_SDMA0_STATUS_REG);
+   si_dump_mmapped_reg(sctx, f, R_00D834_SDMA1_STATUS_REG);
+   if (sctx->chip_class <= GFX8) {
+      si_dump_mmapped_reg(sctx, f, R_000E50_SRBM_STATUS);
+      si_dump_mmapped_reg(sctx, f, R_000E4C_SRBM_STATUS2);
+      si_dump_mmapped_reg(sctx, f, R_000E54_SRBM_STATUS3);
+   }
+   si_dump_mmapped_reg(sctx, f, R_008680_CP_STAT);
+   si_dump_mmapped_reg(sctx, f, R_008674_CP_STALLED_STAT1);
+   si_dump_mmapped_reg(sctx, f, R_008678_CP_STALLED_STAT2);
+   si_dump_mmapped_reg(sctx, f, R_008670_CP_STALLED_STAT3);
+   si_dump_mmapped_reg(sctx, f, R_008210_CP_CPC_STATUS);
+   si_dump_mmapped_reg(sctx, f, R_008214_CP_CPC_BUSY_STAT);
+   si_dump_mmapped_reg(sctx, f, R_008218_CP_CPC_STALLED_STAT1);
+   si_dump_mmapped_reg(sctx, f, R_00821C_CP_CPF_STATUS);
+   si_dump_mmapped_reg(sctx, f, R_008220_CP_CPF_BUSY_STAT);
+   si_dump_mmapped_reg(sctx, f, R_008224_CP_CPF_STALLED_STAT1);
+   fprintf(f, "\n");
 }
 
 struct si_log_chunk_cs {
-       struct si_context *ctx;
-       struct si_saved_cs *cs;
-       bool dump_bo_list;
-       unsigned gfx_begin, gfx_end;
-       unsigned compute_begin, compute_end;
+   struct si_context *ctx;
+   struct si_saved_cs *cs;
+   bool dump_bo_list;
+   unsigned gfx_begin, gfx_end;
+   unsigned compute_begin, compute_end;
 };
 
 static void si_log_chunk_type_cs_destroy(void *data)
 {
-       struct si_log_chunk_cs *chunk = data;
-       si_saved_cs_reference(&chunk->cs, NULL);
-       free(chunk);
+   struct si_log_chunk_cs *chunk = data;
+   si_saved_cs_reference(&chunk->cs, NULL);
+   free(chunk);
 }
 
-static void si_parse_current_ib(FILE *f, struct radeon_cmdbuf *cs,
-                               unsigned begin, unsigned end,
-                               int *last_trace_id, unsigned trace_id_count,
-                               const char *name, enum chip_class chip_class)
+static void si_parse_current_ib(FILE *f, struct radeon_cmdbuf *cs, unsigned begin, unsigned end,
+                                int *last_trace_id, unsigned trace_id_count, const char *name,
+                                enum chip_class chip_class)
 {
-       unsigned orig_end = end;
+   unsigned orig_end = end;
 
-       assert(begin <= end);
+   assert(begin <= end);
 
-       fprintf(f, "------------------ %s begin (dw = %u) ------------------\n",
-               name, begin);
+   fprintf(f, "------------------ %s begin (dw = %u) ------------------\n", name, begin);
 
-       for (unsigned prev_idx = 0; prev_idx < cs->num_prev; ++prev_idx) {
-               struct radeon_cmdbuf_chunk *chunk = &cs->prev[prev_idx];
+   for (unsigned prev_idx = 0; prev_idx < cs->num_prev; ++prev_idx) {
+      struct radeon_cmdbuf_chunk *chunk = &cs->prev[prev_idx];
 
-               if (begin < chunk->cdw) {
-                       ac_parse_ib_chunk(f, chunk->buf + begin,
-                                         MIN2(end, chunk->cdw) - begin,
-                                         last_trace_id, trace_id_count,
-                                         chip_class, NULL, NULL);
-               }
+      if (begin < chunk->cdw) {
+         ac_parse_ib_chunk(f, chunk->buf + begin, MIN2(end, chunk->cdw) - begin, last_trace_id,
+                           trace_id_count, chip_class, NULL, NULL);
+      }
 
-               if (end <= chunk->cdw)
-                       return;
+      if (end <= chunk->cdw)
+         return;
 
-               if (begin < chunk->cdw)
-                       fprintf(f, "\n---------- Next %s Chunk ----------\n\n",
-                               name);
+      if (begin < chunk->cdw)
+         fprintf(f, "\n---------- Next %s Chunk ----------\n\n", name);
 
-               begin -= MIN2(begin, chunk->cdw);
-               end -= chunk->cdw;
-       }
+      begin -= MIN2(begin, chunk->cdw);
+      end -= chunk->cdw;
+   }
 
-       assert(end <= cs->current.cdw);
+   assert(end <= cs->current.cdw);
 
-       ac_parse_ib_chunk(f, cs->current.buf + begin, end - begin, last_trace_id,
-                         trace_id_count, chip_class, NULL, NULL);
+   ac_parse_ib_chunk(f, cs->current.buf + begin, end - begin, last_trace_id, trace_id_count,
+                     chip_class, NULL, NULL);
 
-       fprintf(f, "------------------- %s end (dw = %u) -------------------\n\n",
-               name, orig_end);
+   fprintf(f, "------------------- %s end (dw = %u) -------------------\n\n", name, orig_end);
 }
 
 static void si_log_chunk_type_cs_print(void *data, FILE *f)
 {
-       struct si_log_chunk_cs *chunk = data;
-       struct si_context *ctx = chunk->ctx;
-       struct si_saved_cs *scs = chunk->cs;
-       int last_trace_id = -1;
-       int last_compute_trace_id = -1;
-
-       /* We are expecting that the ddebug pipe has already
-        * waited for the context, so this buffer should be idle.
-        * If the GPU is hung, there is no point in waiting for it.
-        */
-       uint32_t *map = ctx->ws->buffer_map(scs->trace_buf->buf,
-                                             NULL,
-                                             PIPE_TRANSFER_UNSYNCHRONIZED |
-                                             PIPE_TRANSFER_READ);
-       if (map) {
-               last_trace_id = map[0];
-               last_compute_trace_id = map[1];
-       }
-
-       if (chunk->gfx_end != chunk->gfx_begin) {
-               if (chunk->gfx_begin == 0) {
-                       if (ctx->init_config)
-                               ac_parse_ib(f, ctx->init_config->pm4, ctx->init_config->ndw,
-                                           NULL, 0, "IB2: Init config", ctx->chip_class,
-                                           NULL, NULL);
-
-                       if (ctx->init_config_gs_rings)
-                               ac_parse_ib(f, ctx->init_config_gs_rings->pm4,
-                                           ctx->init_config_gs_rings->ndw,
-                                           NULL, 0, "IB2: Init GS rings", ctx->chip_class,
-                                           NULL, NULL);
-               }
-
-               if (scs->flushed) {
-                       ac_parse_ib(f, scs->gfx.ib + chunk->gfx_begin,
-                                   chunk->gfx_end - chunk->gfx_begin,
-                                   &last_trace_id, map ? 1 : 0, "IB", ctx->chip_class,
-                                   NULL, NULL);
-               } else {
-                       si_parse_current_ib(f, ctx->gfx_cs, chunk->gfx_begin,
-                                           chunk->gfx_end, &last_trace_id, map ? 1 : 0,
-                                           "IB", ctx->chip_class);
-               }
-       }
-
-       if (chunk->compute_end != chunk->compute_begin) {
-               assert(ctx->prim_discard_compute_cs);
-
-               if (scs->flushed) {
-                       ac_parse_ib(f, scs->compute.ib + chunk->compute_begin,
-                                   chunk->compute_end - chunk->compute_begin,
-                                   &last_compute_trace_id, map ? 1 : 0, "Compute IB", ctx->chip_class,
-                                   NULL, NULL);
-               } else {
-                       si_parse_current_ib(f, ctx->prim_discard_compute_cs, chunk->compute_begin,
-                                           chunk->compute_end, &last_compute_trace_id,
-                                           map ? 1 : 0, "Compute IB", ctx->chip_class);
-               }
-       }
-
-       if (chunk->dump_bo_list) {
-               fprintf(f, "Flushing. Time: ");
-               util_dump_ns(f, scs->time_flush);
-               fprintf(f, "\n\n");
-               si_dump_bo_list(ctx, &scs->gfx, f);
-       }
+   struct si_log_chunk_cs *chunk = data;
+   struct si_context *ctx = chunk->ctx;
+   struct si_saved_cs *scs = chunk->cs;
+   int last_trace_id = -1;
+   int last_compute_trace_id = -1;
+
+   /* We are expecting that the ddebug pipe has already
+    * waited for the context, so this buffer should be idle.
+    * If the GPU is hung, there is no point in waiting for it.
+    */
+   uint32_t *map = ctx->ws->buffer_map(scs->trace_buf->buf, NULL,
+                                       PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_READ);
+   if (map) {
+      last_trace_id = map[0];
+      last_compute_trace_id = map[1];
+   }
+
+   if (chunk->gfx_end != chunk->gfx_begin) {
+      if (chunk->gfx_begin == 0) {
+         if (ctx->init_config)
+            ac_parse_ib(f, ctx->init_config->pm4, ctx->init_config->ndw, NULL, 0,
+                        "IB2: Init config", ctx->chip_class, NULL, NULL);
+
+         if (ctx->init_config_gs_rings)
+            ac_parse_ib(f, ctx->init_config_gs_rings->pm4, ctx->init_config_gs_rings->ndw, NULL, 0,
+                        "IB2: Init GS rings", ctx->chip_class, NULL, NULL);
+      }
+
+      if (scs->flushed) {
+         ac_parse_ib(f, scs->gfx.ib + chunk->gfx_begin, chunk->gfx_end - chunk->gfx_begin,
+                     &last_trace_id, map ? 1 : 0, "IB", ctx->chip_class, NULL, NULL);
+      } else {
+         si_parse_current_ib(f, ctx->gfx_cs, chunk->gfx_begin, chunk->gfx_end, &last_trace_id,
+                             map ? 1 : 0, "IB", ctx->chip_class);
+      }
+   }
+
+   if (chunk->compute_end != chunk->compute_begin) {
+      assert(ctx->prim_discard_compute_cs);
+
+      if (scs->flushed) {
+         ac_parse_ib(f, scs->compute.ib + chunk->compute_begin,
+                     chunk->compute_end - chunk->compute_begin, &last_compute_trace_id, map ? 1 : 0,
+                     "Compute IB", ctx->chip_class, NULL, NULL);
+      } else {
+         si_parse_current_ib(f, ctx->prim_discard_compute_cs, chunk->compute_begin,
+                             chunk->compute_end, &last_compute_trace_id, map ? 1 : 0, "Compute IB",
+                             ctx->chip_class);
+      }
+   }
+
+   if (chunk->dump_bo_list) {
+      fprintf(f, "Flushing. Time: ");
+      util_dump_ns(f, scs->time_flush);
+      fprintf(f, "\n\n");
+      si_dump_bo_list(ctx, &scs->gfx, f);
+   }
 }
 
 static const struct u_log_chunk_type si_log_chunk_type_cs = {
-       .destroy = si_log_chunk_type_cs_destroy,
-       .print = si_log_chunk_type_cs_print,
+   .destroy = si_log_chunk_type_cs_destroy,
+   .print = si_log_chunk_type_cs_print,
 };
 
-static void si_log_cs(struct si_context *ctx, struct u_log_context *log,
-                     bool dump_bo_list)
+static void si_log_cs(struct si_context *ctx, struct u_log_context *log, bool dump_bo_list)
 {
-       assert(ctx->current_saved_cs);
+   assert(ctx->current_saved_cs);
 
-       struct si_saved_cs *scs = ctx->current_saved_cs;
-       unsigned gfx_cur = ctx->gfx_cs->prev_dw + ctx->gfx_cs->current.cdw;
-       unsigned compute_cur = 0;
+   struct si_saved_cs *scs = ctx->current_saved_cs;
+   unsigned gfx_cur = ctx->gfx_cs->prev_dw + ctx->gfx_cs->current.cdw;
+   unsigned compute_cur = 0;
 
-       if (ctx->prim_discard_compute_cs)
-               compute_cur = ctx->prim_discard_compute_cs->prev_dw + ctx->prim_discard_compute_cs->current.cdw;
+   if (ctx->prim_discard_compute_cs)
+      compute_cur =
+         ctx->prim_discard_compute_cs->prev_dw + ctx->prim_discard_compute_cs->current.cdw;
 
-       if (!dump_bo_list &&
-           gfx_cur == scs->gfx_last_dw &&
-           compute_cur == scs->compute_last_dw)
-               return;
+   if (!dump_bo_list && gfx_cur == scs->gfx_last_dw && compute_cur == scs->compute_last_dw)
+      return;
 
-       struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk));
+   struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk));
 
-       chunk->ctx = ctx;
-       si_saved_cs_reference(&chunk->cs, scs);
-       chunk->dump_bo_list = dump_bo_list;
+   chunk->ctx = ctx;
+   si_saved_cs_reference(&chunk->cs, scs);
+   chunk->dump_bo_list = dump_bo_list;
 
-       chunk->gfx_begin = scs->gfx_last_dw;
-       chunk->gfx_end = gfx_cur;
-       scs->gfx_last_dw = gfx_cur;
+   chunk->gfx_begin = scs->gfx_last_dw;
+   chunk->gfx_end = gfx_cur;
+   scs->gfx_last_dw = gfx_cur;
 
-       chunk->compute_begin = scs->compute_last_dw;
-       chunk->compute_end = compute_cur;
-       scs->compute_last_dw = compute_cur;
+   chunk->compute_begin = scs->compute_last_dw;
+   chunk->compute_end = compute_cur;
+   scs->compute_last_dw = compute_cur;
 
-       u_log_chunk(log, &si_log_chunk_type_cs, chunk);
+   u_log_chunk(log, &si_log_chunk_type_cs, chunk);
 }
 
 void si_auto_log_cs(void *data, struct u_log_context *log)
 {
-       struct si_context *ctx = (struct si_context *)data;
-       si_log_cs(ctx, log, false);
+   struct si_context *ctx = (struct si_context *)data;
+   si_log_cs(ctx, log, false);
 }
 
 void si_log_hw_flush(struct si_context *sctx)
 {
-       if (!sctx->log)
-               return;
-
-       si_log_cs(sctx, sctx->log, true);
-
-       if (&sctx->b == sctx->screen->aux_context) {
-               /* The aux context isn't captured by the ddebug wrapper,
-                * so we dump it on a flush-by-flush basis here.
-                */
-               FILE *f = dd_get_debug_file(false);
-               if (!f) {
-                       fprintf(stderr, "radeonsi: error opening aux context dump file.\n");
-               } else {
-                       dd_write_header(f, &sctx->screen->b, 0);
-
-                       fprintf(f, "Aux context dump:\n\n");
-                       u_log_new_page_print(sctx->log, f);
-
-                       fclose(f);
-               }
-       }
+   if (!sctx->log)
+      return;
+
+   si_log_cs(sctx, sctx->log, true);
+
+   if (&sctx->b == sctx->screen->aux_context) {
+      /* The aux context isn't captured by the ddebug wrapper,
+       * so we dump it on a flush-by-flush basis here.
+       */
+      FILE *f = dd_get_debug_file(false);
+      if (!f) {
+         fprintf(stderr, "radeonsi: error opening aux context dump file.\n");
+      } else {
+         dd_write_header(f, &sctx->screen->b, 0);
+
+         fprintf(f, "Aux context dump:\n\n");
+         u_log_new_page_print(sctx->log, f);
+
+         fclose(f);
+      }
+   }
 }
 
 static const char *priority_to_string(enum radeon_bo_priority priority)
 {
 #define ITEM(x) [RADEON_PRIO_##x] = #x
-       static const char *table[64] = {
-               ITEM(FENCE),
-               ITEM(TRACE),
-               ITEM(SO_FILLED_SIZE),
-               ITEM(QUERY),
-               ITEM(IB1),
-               ITEM(IB2),
-               ITEM(DRAW_INDIRECT),
-               ITEM(INDEX_BUFFER),
-               ITEM(CP_DMA),
-               ITEM(CONST_BUFFER),
-               ITEM(DESCRIPTORS),
-               ITEM(BORDER_COLORS),
-               ITEM(SAMPLER_BUFFER),
-               ITEM(VERTEX_BUFFER),
-               ITEM(SHADER_RW_BUFFER),
-               ITEM(COMPUTE_GLOBAL),
-               ITEM(SAMPLER_TEXTURE),
-               ITEM(SHADER_RW_IMAGE),
-               ITEM(SAMPLER_TEXTURE_MSAA),
-               ITEM(COLOR_BUFFER),
-               ITEM(DEPTH_BUFFER),
-               ITEM(COLOR_BUFFER_MSAA),
-               ITEM(DEPTH_BUFFER_MSAA),
-               ITEM(SEPARATE_META),
-               ITEM(SHADER_BINARY),
-               ITEM(SHADER_RINGS),
-               ITEM(SCRATCH_BUFFER),
-       };
+   static const char *table[64] = {
+      ITEM(FENCE),
+      ITEM(TRACE),
+      ITEM(SO_FILLED_SIZE),
+      ITEM(QUERY),
+      ITEM(IB1),
+      ITEM(IB2),
+      ITEM(DRAW_INDIRECT),
+      ITEM(INDEX_BUFFER),
+      ITEM(CP_DMA),
+      ITEM(CONST_BUFFER),
+      ITEM(DESCRIPTORS),
+      ITEM(BORDER_COLORS),
+      ITEM(SAMPLER_BUFFER),
+      ITEM(VERTEX_BUFFER),
+      ITEM(SHADER_RW_BUFFER),
+      ITEM(COMPUTE_GLOBAL),
+      ITEM(SAMPLER_TEXTURE),
+      ITEM(SHADER_RW_IMAGE),
+      ITEM(SAMPLER_TEXTURE_MSAA),
+      ITEM(COLOR_BUFFER),
+      ITEM(DEPTH_BUFFER),
+      ITEM(COLOR_BUFFER_MSAA),
+      ITEM(DEPTH_BUFFER_MSAA),
+      ITEM(SEPARATE_META),
+      ITEM(SHADER_BINARY),
+      ITEM(SHADER_RINGS),
+      ITEM(SCRATCH_BUFFER),
+   };
 #undef ITEM
 
-       assert(priority < ARRAY_SIZE(table));
-       return table[priority];
+   assert(priority < ARRAY_SIZE(table));
+   return table[priority];
 }
 
 static int bo_list_compare_va(const struct radeon_bo_list_item *a,
-                                  const struct radeon_bo_list_item *b)
+                              const struct radeon_bo_list_item *b)
 {
-       return a->vm_address < b->vm_address ? -1 :
-              a->vm_address > b->vm_address ? 1 : 0;
+   return a->vm_address < b->vm_address ? -1 : a->vm_address > b->vm_address ? 1 : 0;
 }
 
-static void si_dump_bo_list(struct si_context *sctx,
-                           const struct radeon_saved_cs *saved, FILE *f)
+static void si_dump_bo_list(struct si_context *sctx, const struct radeon_saved_cs *saved, FILE *f)
 {
-       unsigned i,j;
-
-       if (!saved->bo_list)
-               return;
-
-       /* Sort the list according to VM adddresses first. */
-       qsort(saved->bo_list, saved->bo_count,
-             sizeof(saved->bo_list[0]), (void*)bo_list_compare_va);
-
-       fprintf(f, "Buffer list (in units of pages = 4kB):\n"
-               COLOR_YELLOW "        Size    VM start page         "
-               "VM end page           Usage" COLOR_RESET "\n");
-
-       for (i = 0; i < saved->bo_count; i++) {
-               /* Note: Buffer sizes are expected to be aligned to 4k by the winsys. */
-               const unsigned page_size = sctx->screen->info.gart_page_size;
-               uint64_t va = saved->bo_list[i].vm_address;
-               uint64_t size = saved->bo_list[i].bo_size;
-               bool hit = false;
-
-               /* If there's unused virtual memory between 2 buffers, print it. */
-               if (i) {
-                       uint64_t previous_va_end = saved->bo_list[i-1].vm_address +
-                                                  saved->bo_list[i-1].bo_size;
-
-                       if (va > previous_va_end) {
-                               fprintf(f, "  %10"PRIu64"    -- hole --\n",
-                                       (va - previous_va_end) / page_size);
-                       }
-               }
-
-               /* Print the buffer. */
-               fprintf(f, "  %10"PRIu64"    0x%013"PRIX64"       0x%013"PRIX64"       ",
-                       size / page_size, va / page_size, (va + size) / page_size);
-
-               /* Print the usage. */
-               for (j = 0; j < 32; j++) {
-                       if (!(saved->bo_list[i].priority_usage & (1u << j)))
-                               continue;
-
-                       fprintf(f, "%s%s", !hit ? "" : ", ", priority_to_string(j));
-                       hit = true;
-               }
-               fprintf(f, "\n");
-       }
-       fprintf(f, "\nNote: The holes represent memory not used by the IB.\n"
-                  "      Other buffers can still be allocated there.\n\n");
+   unsigned i, j;
+
+   if (!saved->bo_list)
+      return;
+
+   /* Sort the list according to VM addresses first. */
+   qsort(saved->bo_list, saved->bo_count, sizeof(saved->bo_list[0]), (void *)bo_list_compare_va);
+
+   fprintf(f, "Buffer list (in units of pages = 4kB):\n" COLOR_YELLOW
+              "        Size    VM start page         "
+              "VM end page           Usage" COLOR_RESET "\n");
+
+   for (i = 0; i < saved->bo_count; i++) {
+      /* Note: Buffer sizes are expected to be aligned to 4k by the winsys. */
+      const unsigned page_size = sctx->screen->info.gart_page_size;
+      uint64_t va = saved->bo_list[i].vm_address;
+      uint64_t size = saved->bo_list[i].bo_size;
+      bool hit = false;
+
+      /* If there's unused virtual memory between 2 buffers, print it. */
+      if (i) {
+         uint64_t previous_va_end =
+            saved->bo_list[i - 1].vm_address + saved->bo_list[i - 1].bo_size;
+
+         if (va > previous_va_end) {
+            fprintf(f, "  %10" PRIu64 "    -- hole --\n", (va - previous_va_end) / page_size);
+         }
+      }
+
+      /* Print the buffer. */
+      fprintf(f, "  %10" PRIu64 "    0x%013" PRIX64 "       0x%013" PRIX64 "       ",
+              size / page_size, va / page_size, (va + size) / page_size);
+
+      /* Print the usage. */
+      for (j = 0; j < 32; j++) {
+         if (!(saved->bo_list[i].priority_usage & (1u << j)))
+            continue;
+
+         fprintf(f, "%s%s", !hit ? "" : ", ", priority_to_string(j));
+         hit = true;
+      }
+      fprintf(f, "\n");
+   }
+   fprintf(f, "\nNote: The holes represent memory not used by the IB.\n"
+              "      Other buffers can still be allocated there.\n\n");
 }
 
 static void si_dump_framebuffer(struct si_context *sctx, struct u_log_context *log)
 {
-       struct pipe_framebuffer_state *state = &sctx->framebuffer.state;
-       struct si_texture *tex;
-       int i;
-
-       for (i = 0; i < state->nr_cbufs; i++) {
-               if (!state->cbufs[i])
-                       continue;
-
-               tex = (struct si_texture*)state->cbufs[i]->texture;
-               u_log_printf(log, COLOR_YELLOW "Color buffer %i:" COLOR_RESET "\n", i);
-               si_print_texture_info(sctx->screen, tex, log);
-               u_log_printf(log, "\n");
-       }
-
-       if (state->zsbuf) {
-               tex = (struct si_texture*)state->zsbuf->texture;
-               u_log_printf(log, COLOR_YELLOW "Depth-stencil buffer:" COLOR_RESET "\n");
-               si_print_texture_info(sctx->screen, tex, log);
-               u_log_printf(log, "\n");
-       }
+   struct pipe_framebuffer_state *state = &sctx->framebuffer.state;
+   struct si_texture *tex;
+   int i;
+
+   for (i = 0; i < state->nr_cbufs; i++) {
+      if (!state->cbufs[i])
+         continue;
+
+      tex = (struct si_texture *)state->cbufs[i]->texture;
+      u_log_printf(log, COLOR_YELLOW "Color buffer %i:" COLOR_RESET "\n", i);
+      si_print_texture_info(sctx->screen, tex, log);
+      u_log_printf(log, "\n");
+   }
+
+   if (state->zsbuf) {
+      tex = (struct si_texture *)state->zsbuf->texture;
+      u_log_printf(log, COLOR_YELLOW "Depth-stencil buffer:" COLOR_RESET "\n");
+      si_print_texture_info(sctx->screen, tex, log);
+      u_log_printf(log, "\n");
+   }
 }
 
 typedef unsigned (*slot_remap_func)(unsigned);
 
 struct si_log_chunk_desc_list {
-       /** Pointer to memory map of buffer where the list is uploader */
-       uint32_t *gpu_list;
-       /** Reference of buffer where the list is uploaded, so that gpu_list
-        * is kept live. */
-       struct si_resource *buf;
-
-       const char *shader_name;
-       const char *elem_name;
-       slot_remap_func slot_remap;
-       enum chip_class chip_class;
-       unsigned element_dw_size;
-       unsigned num_elements;
-
-       uint32_t list[0];
+   /** Pointer to the memory map of the buffer where the list is uploaded */
+   uint32_t *gpu_list;
+   /** Reference to the buffer where the list is uploaded, so that gpu_list
+    * is kept alive. */
+   struct si_resource *buf;
+
+   const char *shader_name;
+   const char *elem_name;
+   slot_remap_func slot_remap;
+   enum chip_class chip_class;
+   unsigned element_dw_size;
+   unsigned num_elements;
+
+   uint32_t list[0];
 };
 
-static void
-si_log_chunk_desc_list_destroy(void *data)
+static void si_log_chunk_desc_list_destroy(void *data)
 {
-       struct si_log_chunk_desc_list *chunk = data;
-       si_resource_reference(&chunk->buf, NULL);
-       FREE(chunk);
+   struct si_log_chunk_desc_list *chunk = data;
+   si_resource_reference(&chunk->buf, NULL);
+   FREE(chunk);
 }
 
-static void
-si_log_chunk_desc_list_print(void *data, FILE *f)
+static void si_log_chunk_desc_list_print(void *data, FILE *f)
 {
-       struct si_log_chunk_desc_list *chunk = data;
-       unsigned sq_img_rsrc_word0 = chunk->chip_class >= GFX10 ? R_00A000_SQ_IMG_RSRC_WORD0
-                                                               : R_008F10_SQ_IMG_RSRC_WORD0;
-
-       for (unsigned i = 0; i < chunk->num_elements; i++) {
-               unsigned cpu_dw_offset = i * chunk->element_dw_size;
-               unsigned gpu_dw_offset = chunk->slot_remap(i) * chunk->element_dw_size;
-               const char *list_note = chunk->gpu_list ? "GPU list" : "CPU list";
-               uint32_t *cpu_list = chunk->list + cpu_dw_offset;
-               uint32_t *gpu_list = chunk->gpu_list ? chunk->gpu_list + gpu_dw_offset : cpu_list;
-
-               fprintf(f, COLOR_GREEN "%s%s slot %u (%s):" COLOR_RESET "\n",
-                       chunk->shader_name, chunk->elem_name, i, list_note);
-
-               switch (chunk->element_dw_size) {
-               case 4:
-                       for (unsigned j = 0; j < 4; j++)
-                               ac_dump_reg(f, chunk->chip_class,
-                                           R_008F00_SQ_BUF_RSRC_WORD0 + j*4,
-                                           gpu_list[j], 0xffffffff);
-                       break;
-               case 8:
-                       for (unsigned j = 0; j < 8; j++)
-                               ac_dump_reg(f, chunk->chip_class,
-                                           sq_img_rsrc_word0 + j*4,
-                                           gpu_list[j], 0xffffffff);
-
-                       fprintf(f, COLOR_CYAN "    Buffer:" COLOR_RESET "\n");
-                       for (unsigned j = 0; j < 4; j++)
-                               ac_dump_reg(f, chunk->chip_class,
-                                           R_008F00_SQ_BUF_RSRC_WORD0 + j*4,
-                                           gpu_list[4+j], 0xffffffff);
-                       break;
-               case 16:
-                       for (unsigned j = 0; j < 8; j++)
-                               ac_dump_reg(f, chunk->chip_class,
-                                           sq_img_rsrc_word0 + j*4,
-                                           gpu_list[j], 0xffffffff);
-
-                       fprintf(f, COLOR_CYAN "    Buffer:" COLOR_RESET "\n");
-                       for (unsigned j = 0; j < 4; j++)
-                               ac_dump_reg(f, chunk->chip_class,
-                                           R_008F00_SQ_BUF_RSRC_WORD0 + j*4,
-                                           gpu_list[4+j], 0xffffffff);
-
-                       fprintf(f, COLOR_CYAN "    FMASK:" COLOR_RESET "\n");
-                       for (unsigned j = 0; j < 8; j++)
-                               ac_dump_reg(f, chunk->chip_class,
-                                           sq_img_rsrc_word0 + j*4,
-                                           gpu_list[8+j], 0xffffffff);
-
-                       fprintf(f, COLOR_CYAN "    Sampler state:" COLOR_RESET "\n");
-                       for (unsigned j = 0; j < 4; j++)
-                               ac_dump_reg(f, chunk->chip_class,
-                                           R_008F30_SQ_IMG_SAMP_WORD0 + j*4,
-                                           gpu_list[12+j], 0xffffffff);
-                       break;
-               }
-
-               if (memcmp(gpu_list, cpu_list, chunk->element_dw_size * 4) != 0) {
-                       fprintf(f, COLOR_RED "!!!!! This slot was corrupted in GPU memory !!!!!"
-                               COLOR_RESET "\n");
-               }
-
-               fprintf(f, "\n");
-       }
-
+   struct si_log_chunk_desc_list *chunk = data;
+   unsigned sq_img_rsrc_word0 =
+      chunk->chip_class >= GFX10 ? R_00A000_SQ_IMG_RSRC_WORD0 : R_008F10_SQ_IMG_RSRC_WORD0;
+
+   for (unsigned i = 0; i < chunk->num_elements; i++) {
+      unsigned cpu_dw_offset = i * chunk->element_dw_size;
+      unsigned gpu_dw_offset = chunk->slot_remap(i) * chunk->element_dw_size;
+      const char *list_note = chunk->gpu_list ? "GPU list" : "CPU list";
+      uint32_t *cpu_list = chunk->list + cpu_dw_offset;
+      uint32_t *gpu_list = chunk->gpu_list ? chunk->gpu_list + gpu_dw_offset : cpu_list;
+
+      fprintf(f, COLOR_GREEN "%s%s slot %u (%s):" COLOR_RESET "\n", chunk->shader_name,
+              chunk->elem_name, i, list_note);
+
+      switch (chunk->element_dw_size) {
+      case 4:
+         for (unsigned j = 0; j < 4; j++)
+            ac_dump_reg(f, chunk->chip_class, R_008F00_SQ_BUF_RSRC_WORD0 + j * 4, gpu_list[j],
+                        0xffffffff);
+         break;
+      case 8:
+         for (unsigned j = 0; j < 8; j++)
+            ac_dump_reg(f, chunk->chip_class, sq_img_rsrc_word0 + j * 4, gpu_list[j], 0xffffffff);
+
+         fprintf(f, COLOR_CYAN "    Buffer:" COLOR_RESET "\n");
+         for (unsigned j = 0; j < 4; j++)
+            ac_dump_reg(f, chunk->chip_class, R_008F00_SQ_BUF_RSRC_WORD0 + j * 4, gpu_list[4 + j],
+                        0xffffffff);
+         break;
+      case 16:
+         for (unsigned j = 0; j < 8; j++)
+            ac_dump_reg(f, chunk->chip_class, sq_img_rsrc_word0 + j * 4, gpu_list[j], 0xffffffff);
+
+         fprintf(f, COLOR_CYAN "    Buffer:" COLOR_RESET "\n");
+         for (unsigned j = 0; j < 4; j++)
+            ac_dump_reg(f, chunk->chip_class, R_008F00_SQ_BUF_RSRC_WORD0 + j * 4, gpu_list[4 + j],
+                        0xffffffff);
+
+         fprintf(f, COLOR_CYAN "    FMASK:" COLOR_RESET "\n");
+         for (unsigned j = 0; j < 8; j++)
+            ac_dump_reg(f, chunk->chip_class, sq_img_rsrc_word0 + j * 4, gpu_list[8 + j],
+                        0xffffffff);
+
+         fprintf(f, COLOR_CYAN "    Sampler state:" COLOR_RESET "\n");
+         for (unsigned j = 0; j < 4; j++)
+            ac_dump_reg(f, chunk->chip_class, R_008F30_SQ_IMG_SAMP_WORD0 + j * 4, gpu_list[12 + j],
+                        0xffffffff);
+         break;
+      }
+
+      if (memcmp(gpu_list, cpu_list, chunk->element_dw_size * 4) != 0) {
+         fprintf(f, COLOR_RED "!!!!! This slot was corrupted in GPU memory !!!!!" COLOR_RESET "\n");
+      }
+
+      fprintf(f, "\n");
+   }
 }
 
 static const struct u_log_chunk_type si_log_chunk_type_descriptor_list = {
-       .destroy = si_log_chunk_desc_list_destroy,
-       .print = si_log_chunk_desc_list_print,
+   .destroy = si_log_chunk_desc_list_destroy,
+   .print = si_log_chunk_desc_list_print,
 };
 
-static void si_dump_descriptor_list(struct si_screen *screen,
-                                   struct si_descriptors *desc,
-                                   const char *shader_name,
-                                   const char *elem_name,
-                                   unsigned element_dw_size,
-                                   unsigned num_elements,
-                                   slot_remap_func slot_remap,
-                                   struct u_log_context *log)
+static void si_dump_descriptor_list(struct si_screen *screen, struct si_descriptors *desc,
+                                    const char *shader_name, const char *elem_name,
+                                    unsigned element_dw_size, unsigned num_elements,
+                                    slot_remap_func slot_remap, struct u_log_context *log)
 {
-       if (!desc->list)
-               return;
-
-       /* In some cases, the caller doesn't know how many elements are really
-        * uploaded. Reduce num_elements to fit in the range of active slots. */
-       unsigned active_range_dw_begin =
-               desc->first_active_slot * desc->element_dw_size;
-       unsigned active_range_dw_end =
-               active_range_dw_begin + desc->num_active_slots * desc->element_dw_size;
-
-       while (num_elements > 0) {
-               int i = slot_remap(num_elements - 1);
-               unsigned dw_begin = i * element_dw_size;
-               unsigned dw_end = dw_begin + element_dw_size;
-
-               if (dw_begin >= active_range_dw_begin && dw_end <= active_range_dw_end)
-                       break;
-
-               num_elements--;
-       }
-
-       struct si_log_chunk_desc_list *chunk =
-               CALLOC_VARIANT_LENGTH_STRUCT(si_log_chunk_desc_list,
-                                            4 * element_dw_size * num_elements);
-       chunk->shader_name = shader_name;
-       chunk->elem_name = elem_name;
-       chunk->element_dw_size = element_dw_size;
-       chunk->num_elements = num_elements;
-       chunk->slot_remap = slot_remap;
-       chunk->chip_class = screen->info.chip_class;
-
-       si_resource_reference(&chunk->buf, desc->buffer);
-       chunk->gpu_list = desc->gpu_list;
-
-       for (unsigned i = 0; i < num_elements; ++i) {
-               memcpy(&chunk->list[i * element_dw_size],
-                      &desc->list[slot_remap(i) * element_dw_size],
-                      4 * element_dw_size);
-       }
-
-       u_log_chunk(log, &si_log_chunk_type_descriptor_list, chunk);
+   if (!desc->list)
+      return;
+
+   /* In some cases, the caller doesn't know how many elements are really
+    * uploaded. Reduce num_elements to fit in the range of active slots. */
+   unsigned active_range_dw_begin = desc->first_active_slot * desc->element_dw_size;
+   unsigned active_range_dw_end =
+      active_range_dw_begin + desc->num_active_slots * desc->element_dw_size;
+
+   while (num_elements > 0) {
+      int i = slot_remap(num_elements - 1);
+      unsigned dw_begin = i * element_dw_size;
+      unsigned dw_end = dw_begin + element_dw_size;
+
+      if (dw_begin >= active_range_dw_begin && dw_end <= active_range_dw_end)
+         break;
+
+      num_elements--;
+   }
+
+   struct si_log_chunk_desc_list *chunk =
+      CALLOC_VARIANT_LENGTH_STRUCT(si_log_chunk_desc_list, 4 * element_dw_size * num_elements);
+   chunk->shader_name = shader_name;
+   chunk->elem_name = elem_name;
+   chunk->element_dw_size = element_dw_size;
+   chunk->num_elements = num_elements;
+   chunk->slot_remap = slot_remap;
+   chunk->chip_class = screen->info.chip_class;
+
+   si_resource_reference(&chunk->buf, desc->buffer);
+   chunk->gpu_list = desc->gpu_list;
+
+   for (unsigned i = 0; i < num_elements; ++i) {
+      memcpy(&chunk->list[i * element_dw_size], &desc->list[slot_remap(i) * element_dw_size],
+             4 * element_dw_size);
+   }
+
+   u_log_chunk(log, &si_log_chunk_type_descriptor_list, chunk);
 }
 
 static unsigned si_identity(unsigned slot)
 {
-       return slot;
+   return slot;
 }
 
-static void si_dump_descriptors(struct si_context *sctx,
-                               enum pipe_shader_type processor,
-                               const struct si_shader_info *info,
-                               struct u_log_context *log)
+static void si_dump_descriptors(struct si_context *sctx, enum pipe_shader_type processor,
+                                const struct si_shader_info *info, struct u_log_context *log)
 {
-       struct si_descriptors *descs =
-               &sctx->descriptors[SI_DESCS_FIRST_SHADER +
-                                  processor * SI_NUM_SHADER_DESCS];
-       static const char *shader_name[] = {"VS", "PS", "GS", "TCS", "TES", "CS"};
-       const char *name = shader_name[processor];
-       unsigned enabled_constbuf, enabled_shaderbuf, enabled_samplers;
-       unsigned enabled_images;
-
-       if (info) {
-               enabled_constbuf = info->const_buffers_declared;
-               enabled_shaderbuf = info->shader_buffers_declared;
-               enabled_samplers = info->samplers_declared;
-               enabled_images = info->images_declared;
-       } else {
-               enabled_constbuf = sctx->const_and_shader_buffers[processor].enabled_mask >>
-                                  SI_NUM_SHADER_BUFFERS;
-               enabled_shaderbuf = sctx->const_and_shader_buffers[processor].enabled_mask &
-                                   u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS);
-               enabled_shaderbuf = util_bitreverse(enabled_shaderbuf) >>
-                                   (32 - SI_NUM_SHADER_BUFFERS);
-               enabled_samplers = sctx->samplers[processor].enabled_mask;
-               enabled_images = sctx->images[processor].enabled_mask;
-       }
-
-       if (processor == PIPE_SHADER_VERTEX &&
-           sctx->vb_descriptors_buffer &&
-           sctx->vb_descriptors_gpu_list &&
-           sctx->vertex_elements) {
-               assert(info); /* only CS may not have an info struct */
-               struct si_descriptors desc = {};
-
-               desc.buffer = sctx->vb_descriptors_buffer;
-               desc.list = sctx->vb_descriptors_gpu_list;
-               desc.gpu_list = sctx->vb_descriptors_gpu_list;
-               desc.element_dw_size = 4;
-               desc.num_active_slots = sctx->vertex_elements->vb_desc_list_alloc_size / 16;
-
-               si_dump_descriptor_list(sctx->screen, &desc, name,
-                                       " - Vertex buffer", 4, info->num_inputs,
-                                       si_identity, log);
-       }
-
-       si_dump_descriptor_list(sctx->screen,
-                               &descs[SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS],
-                               name, " - Constant buffer", 4,
-                               util_last_bit(enabled_constbuf),
-                               si_get_constbuf_slot, log);
-       si_dump_descriptor_list(sctx->screen,
-                               &descs[SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS],
-                               name, " - Shader buffer", 4,
-                               util_last_bit(enabled_shaderbuf),
-                               si_get_shaderbuf_slot, log);
-       si_dump_descriptor_list(sctx->screen,
-                               &descs[SI_SHADER_DESCS_SAMPLERS_AND_IMAGES],
-                               name, " - Sampler", 16,
-                               util_last_bit(enabled_samplers),
-                               si_get_sampler_slot, log);
-       si_dump_descriptor_list(sctx->screen,
-                               &descs[SI_SHADER_DESCS_SAMPLERS_AND_IMAGES],
-                               name, " - Image", 8,
-                               util_last_bit(enabled_images),
-                               si_get_image_slot, log);
+   struct si_descriptors *descs =
+      &sctx->descriptors[SI_DESCS_FIRST_SHADER + processor * SI_NUM_SHADER_DESCS];
+   static const char *shader_name[] = {"VS", "PS", "GS", "TCS", "TES", "CS"};
+   const char *name = shader_name[processor];
+   unsigned enabled_constbuf, enabled_shaderbuf, enabled_samplers;
+   unsigned enabled_images;
+
+   if (info) {
+      enabled_constbuf = info->const_buffers_declared;
+      enabled_shaderbuf = info->shader_buffers_declared;
+      enabled_samplers = info->samplers_declared;
+      enabled_images = info->images_declared;
+   } else {
+      enabled_constbuf =
+         sctx->const_and_shader_buffers[processor].enabled_mask >> SI_NUM_SHADER_BUFFERS;
+      enabled_shaderbuf = sctx->const_and_shader_buffers[processor].enabled_mask &
+                          u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS);
+      enabled_shaderbuf = util_bitreverse(enabled_shaderbuf) >> (32 - SI_NUM_SHADER_BUFFERS);
+      enabled_samplers = sctx->samplers[processor].enabled_mask;
+      enabled_images = sctx->images[processor].enabled_mask;
+   }
+
+   if (processor == PIPE_SHADER_VERTEX && sctx->vb_descriptors_buffer &&
+       sctx->vb_descriptors_gpu_list && sctx->vertex_elements) {
+      assert(info); /* only CS may not have an info struct */
+      struct si_descriptors desc = {};
+
+      desc.buffer = sctx->vb_descriptors_buffer;
+      desc.list = sctx->vb_descriptors_gpu_list;
+      desc.gpu_list = sctx->vb_descriptors_gpu_list;
+      desc.element_dw_size = 4;
+      desc.num_active_slots = sctx->vertex_elements->vb_desc_list_alloc_size / 16;
+
+      si_dump_descriptor_list(sctx->screen, &desc, name, " - Vertex buffer", 4, info->num_inputs,
+                              si_identity, log);
+   }
+
+   si_dump_descriptor_list(sctx->screen, &descs[SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS], name,
+                           " - Constant buffer", 4, util_last_bit(enabled_constbuf),
+                           si_get_constbuf_slot, log);
+   si_dump_descriptor_list(sctx->screen, &descs[SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS], name,
+                           " - Shader buffer", 4, util_last_bit(enabled_shaderbuf),
+                           si_get_shaderbuf_slot, log);
+   si_dump_descriptor_list(sctx->screen, &descs[SI_SHADER_DESCS_SAMPLERS_AND_IMAGES], name,
+                           " - Sampler", 16, util_last_bit(enabled_samplers), si_get_sampler_slot,
+                           log);
+   si_dump_descriptor_list(sctx->screen, &descs[SI_SHADER_DESCS_SAMPLERS_AND_IMAGES], name,
+                           " - Image", 8, util_last_bit(enabled_images), si_get_image_slot, log);
 }
 
 static void si_dump_gfx_descriptors(struct si_context *sctx,
-                                   const struct si_shader_ctx_state *state,
-                                   struct u_log_context *log)
+                                    const struct si_shader_ctx_state *state,
+                                    struct u_log_context *log)
 {
-       if (!state->cso || !state->current)
-               return;
+   if (!state->cso || !state->current)
+      return;
 
-       si_dump_descriptors(sctx, state->cso->type, &state->cso->info, log);
+   si_dump_descriptors(sctx, state->cso->type, &state->cso->info, log);
 }
 
-static void si_dump_compute_descriptors(struct si_context *sctx,
-                                       struct u_log_context *log)
+static void si_dump_compute_descriptors(struct si_context *sctx, struct u_log_context *log)
 {
-       if (!sctx->cs_shader_state.program)
-               return;
+   if (!sctx->cs_shader_state.program)
+      return;
 
-       si_dump_descriptors(sctx, PIPE_SHADER_COMPUTE, NULL, log);
+   si_dump_descriptors(sctx, PIPE_SHADER_COMPUTE, NULL, log);
 }
 
 struct si_shader_inst {
-       const char *text; /* start of disassembly for this instruction */
-       unsigned textlen;
-       unsigned size;   /* instruction size = 4 or 8 */
-       uint64_t addr; /* instruction address */
+   const char *text; /* start of disassembly for this instruction */
+   unsigned textlen;
+   unsigned size; /* instruction size = 4 or 8 */
+   uint64_t addr; /* instruction address */
 };
 
 /**
@@ -933,344 +865,323 @@ struct si_shader_inst {
  * The caller must keep \p rtld_binary alive as long as \p instructions are
  * used and then close it afterwards.
  */
-static void si_add_split_disasm(struct si_screen *screen,
-                               struct ac_rtld_binary *rtld_binary,
-                               struct si_shader_binary *binary,
-                               uint64_t *addr,
-                               unsigned *num,
-                               struct si_shader_inst *instructions,
-                               enum pipe_shader_type shader_type,
-                               unsigned wave_size)
+static void si_add_split_disasm(struct si_screen *screen, struct ac_rtld_binary *rtld_binary,
+                                struct si_shader_binary *binary, uint64_t *addr, unsigned *num,
+                                struct si_shader_inst *instructions,
+                                enum pipe_shader_type shader_type, unsigned wave_size)
 {
-       if (!ac_rtld_open(rtld_binary, (struct ac_rtld_open_info){
-                       .info = &screen->info,
-                       .shader_type = tgsi_processor_to_shader_stage(shader_type),
-                       .wave_size = wave_size,
-                       .num_parts = 1,
-                       .elf_ptrs = &binary->elf_buffer,
-                       .elf_sizes = &binary->elf_size }))
-               return;
-
-       const char *disasm;
-       size_t nbytes;
-       if (!ac_rtld_get_section_by_name(rtld_binary, ".AMDGPU.disasm",
-                                        &disasm, &nbytes))
-               return;
-
-       const char *end = disasm + nbytes;
-       while (disasm < end) {
-               const char *semicolon = memchr(disasm, ';', end - disasm);
-               if (!semicolon)
-                       break;
-
-               struct si_shader_inst *inst = &instructions[(*num)++];
-               const char *inst_end = memchr(semicolon + 1, '\n', end - semicolon - 1);
-               if (!inst_end)
-                       inst_end = end;
-
-               inst->text = disasm;
-               inst->textlen = inst_end - disasm;
-
-               inst->addr = *addr;
-               /* More than 16 chars after ";" means the instruction is 8 bytes long. */
-               inst->size = inst_end - semicolon > 16 ? 8 : 4;
-               *addr += inst->size;
-
-               if (inst_end == end)
-                       break;
-               disasm = inst_end + 1;
-       }
+   if (!ac_rtld_open(rtld_binary, (struct ac_rtld_open_info){
+                                     .info = &screen->info,
+                                     .shader_type = tgsi_processor_to_shader_stage(shader_type),
+                                     .wave_size = wave_size,
+                                     .num_parts = 1,
+                                     .elf_ptrs = &binary->elf_buffer,
+                                     .elf_sizes = &binary->elf_size}))
+      return;
+
+   const char *disasm;
+   size_t nbytes;
+   if (!ac_rtld_get_section_by_name(rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
+      return;
+
+   const char *end = disasm + nbytes;
+   while (disasm < end) {
+      const char *semicolon = memchr(disasm, ';', end - disasm);
+      if (!semicolon)
+         break;
+
+      struct si_shader_inst *inst = &instructions[(*num)++];
+      const char *inst_end = memchr(semicolon + 1, '\n', end - semicolon - 1);
+      if (!inst_end)
+         inst_end = end;
+
+      inst->text = disasm;
+      inst->textlen = inst_end - disasm;
+
+      inst->addr = *addr;
+      /* More than 16 chars after ";" means the instruction is 8 bytes long. */
+      inst->size = inst_end - semicolon > 16 ? 8 : 4;
+      *addr += inst->size;
+
+      if (inst_end == end)
+         break;
+      disasm = inst_end + 1;
+   }
 }
 
 /* If the shader is being executed, print its asm instructions, and annotate
  * those that are being executed right now with information about waves that
  * execute them. This is most useful during a GPU hang.
  */
-static void si_print_annotated_shader(struct si_shader *shader,
-                                     struct ac_wave_info *waves,
-                                     unsigned num_waves,
-                                     FILE *f)
+static void si_print_annotated_shader(struct si_shader *shader, struct ac_wave_info *waves,
+                                      unsigned num_waves, FILE *f)
 {
-       if (!shader)
-               return;
-
-       struct si_screen *screen = shader->selector->screen;
-       enum pipe_shader_type shader_type = shader->selector->type;
-       uint64_t start_addr = shader->bo->gpu_address;
-       uint64_t end_addr = start_addr + shader->bo->b.b.width0;
-       unsigned i;
-
-       /* See if any wave executes the shader. */
-       for (i = 0; i < num_waves; i++) {
-               if (start_addr <= waves[i].pc && waves[i].pc <= end_addr)
-                       break;
-       }
-       if (i == num_waves)
-               return; /* the shader is not being executed */
-
-       /* Remember the first found wave. The waves are sorted according to PC. */
-       waves = &waves[i];
-       num_waves -= i;
-
-       /* Get the list of instructions.
-        * Buffer size / 4 is the upper bound of the instruction count.
-        */
-       unsigned num_inst = 0;
-       uint64_t inst_addr = start_addr;
-       unsigned wave_size = si_get_shader_wave_size(shader);
-       struct ac_rtld_binary rtld_binaries[5] = {};
-       struct si_shader_inst *instructions =
-               calloc(shader->bo->b.b.width0 / 4, sizeof(struct si_shader_inst));
-
-       if (shader->prolog) {
-               si_add_split_disasm(screen, &rtld_binaries[0], &shader->prolog->binary,
-                                   &inst_addr, &num_inst, instructions, shader_type, wave_size);
-       }
-       if (shader->previous_stage) {
-               si_add_split_disasm(screen, &rtld_binaries[1], &shader->previous_stage->binary,
-                                   &inst_addr, &num_inst, instructions, shader_type, wave_size);
-       }
-       if (shader->prolog2) {
-               si_add_split_disasm(screen, &rtld_binaries[2], &shader->prolog2->binary,
-                                   &inst_addr, &num_inst, instructions, shader_type, wave_size);
-       }
-       si_add_split_disasm(screen, &rtld_binaries[3], &shader->binary,
-                           &inst_addr, &num_inst, instructions, shader_type, wave_size);
-       if (shader->epilog) {
-               si_add_split_disasm(screen, &rtld_binaries[4], &shader->epilog->binary,
-                                   &inst_addr, &num_inst, instructions, shader_type, wave_size);
-       }
-
-       fprintf(f, COLOR_YELLOW "%s - annotated disassembly:" COLOR_RESET "\n",
-               si_get_shader_name(shader));
-
-       /* Print instructions with annotations. */
-       for (i = 0; i < num_inst; i++) {
-               struct si_shader_inst *inst = &instructions[i];
-
-               fprintf(f, "%.*s [PC=0x%"PRIx64", size=%u]\n",
-                       inst->textlen, inst->text, inst->addr, inst->size);
-
-               /* Print which waves execute the instruction right now. */
-               while (num_waves && inst->addr == waves->pc) {
-                       fprintf(f,
-                               "          " COLOR_GREEN "^ SE%u SH%u CU%u "
-                               "SIMD%u WAVE%u  EXEC=%016"PRIx64 "  ",
-                               waves->se, waves->sh, waves->cu, waves->simd,
-                               waves->wave, waves->exec);
-
-                       if (inst->size == 4) {
-                               fprintf(f, "INST32=%08X" COLOR_RESET "\n",
-                                       waves->inst_dw0);
-                       } else {
-                               fprintf(f, "INST64=%08X %08X" COLOR_RESET "\n",
-                                       waves->inst_dw0, waves->inst_dw1);
-                       }
-
-                       waves->matched = true;
-                       waves = &waves[1];
-                       num_waves--;
-               }
-       }
-
-       fprintf(f, "\n\n");
-       free(instructions);
-       for (unsigned i = 0; i < ARRAY_SIZE(rtld_binaries); ++i)
-               ac_rtld_close(&rtld_binaries[i]);
+   if (!shader)
+      return;
+
+   struct si_screen *screen = shader->selector->screen;
+   enum pipe_shader_type shader_type = shader->selector->type;
+   uint64_t start_addr = shader->bo->gpu_address;
+   uint64_t end_addr = start_addr + shader->bo->b.b.width0;
+   unsigned i;
+
+   /* See if any wave executes the shader. */
+   for (i = 0; i < num_waves; i++) {
+      if (start_addr <= waves[i].pc && waves[i].pc <= end_addr)
+         break;
+   }
+   if (i == num_waves)
+      return; /* the shader is not being executed */
+
+   /* Remember the first found wave. The waves are sorted according to PC. */
+   waves = &waves[i];
+   num_waves -= i;
+
+   /* Get the list of instructions.
+    * Buffer size / 4 is the upper bound of the instruction count.
+    */
+   unsigned num_inst = 0;
+   uint64_t inst_addr = start_addr;
+   unsigned wave_size = si_get_shader_wave_size(shader);
+   struct ac_rtld_binary rtld_binaries[5] = {};
+   struct si_shader_inst *instructions =
+      calloc(shader->bo->b.b.width0 / 4, sizeof(struct si_shader_inst));
+
+   if (shader->prolog) {
+      si_add_split_disasm(screen, &rtld_binaries[0], &shader->prolog->binary, &inst_addr, &num_inst,
+                          instructions, shader_type, wave_size);
+   }
+   if (shader->previous_stage) {
+      si_add_split_disasm(screen, &rtld_binaries[1], &shader->previous_stage->binary, &inst_addr,
+                          &num_inst, instructions, shader_type, wave_size);
+   }
+   if (shader->prolog2) {
+      si_add_split_disasm(screen, &rtld_binaries[2], &shader->prolog2->binary, &inst_addr,
+                          &num_inst, instructions, shader_type, wave_size);
+   }
+   si_add_split_disasm(screen, &rtld_binaries[3], &shader->binary, &inst_addr, &num_inst,
+                       instructions, shader_type, wave_size);
+   if (shader->epilog) {
+      si_add_split_disasm(screen, &rtld_binaries[4], &shader->epilog->binary, &inst_addr, &num_inst,
+                          instructions, shader_type, wave_size);
+   }
+
+   fprintf(f, COLOR_YELLOW "%s - annotated disassembly:" COLOR_RESET "\n",
+           si_get_shader_name(shader));
+
+   /* Print instructions with annotations. */
+   for (i = 0; i < num_inst; i++) {
+      struct si_shader_inst *inst = &instructions[i];
+
+      fprintf(f, "%.*s [PC=0x%" PRIx64 ", size=%u]\n", inst->textlen, inst->text, inst->addr,
+              inst->size);
+
+      /* Print which waves execute the instruction right now. */
+      while (num_waves && inst->addr == waves->pc) {
+         fprintf(f,
+                 "          " COLOR_GREEN "^ SE%u SH%u CU%u "
+                 "SIMD%u WAVE%u  EXEC=%016" PRIx64 "  ",
+                 waves->se, waves->sh, waves->cu, waves->simd, waves->wave, waves->exec);
+
+         if (inst->size == 4) {
+            fprintf(f, "INST32=%08X" COLOR_RESET "\n", waves->inst_dw0);
+         } else {
+            fprintf(f, "INST64=%08X %08X" COLOR_RESET "\n", waves->inst_dw0, waves->inst_dw1);
+         }
+
+         waves->matched = true;
+         waves = &waves[1];
+         num_waves--;
+      }
+   }
+
+   fprintf(f, "\n\n");
+   free(instructions);
+   for (unsigned i = 0; i < ARRAY_SIZE(rtld_binaries); ++i)
+      ac_rtld_close(&rtld_binaries[i]);
 }
 
 static void si_dump_annotated_shaders(struct si_context *sctx, FILE *f)
 {
-       struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP];
-       unsigned num_waves = ac_get_wave_info(sctx->chip_class, waves);
-
-       fprintf(f, COLOR_CYAN "The number of active waves = %u" COLOR_RESET
-               "\n\n", num_waves);
-
-       si_print_annotated_shader(sctx->vs_shader.current, waves, num_waves, f);
-       si_print_annotated_shader(sctx->tcs_shader.current, waves, num_waves, f);
-       si_print_annotated_shader(sctx->tes_shader.current, waves, num_waves, f);
-       si_print_annotated_shader(sctx->gs_shader.current, waves, num_waves, f);
-       si_print_annotated_shader(sctx->ps_shader.current, waves, num_waves, f);
-
-       /* Print waves executing shaders that are not currently bound. */
-       unsigned i;
-       bool found = false;
-       for (i = 0; i < num_waves; i++) {
-               if (waves[i].matched)
-                       continue;
-
-               if (!found) {
-                       fprintf(f, COLOR_CYAN
-                               "Waves not executing currently-bound shaders:"
-                               COLOR_RESET "\n");
-                       found = true;
-               }
-               fprintf(f, "    SE%u SH%u CU%u SIMD%u WAVE%u  EXEC=%016"PRIx64
-                       "  INST=%08X %08X  PC=%"PRIx64"\n",
-                       waves[i].se, waves[i].sh, waves[i].cu, waves[i].simd,
-                       waves[i].wave, waves[i].exec, waves[i].inst_dw0,
-                       waves[i].inst_dw1, waves[i].pc);
-       }
-       if (found)
-               fprintf(f, "\n\n");
+   struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP];
+   unsigned num_waves = ac_get_wave_info(sctx->chip_class, waves);
+
+   fprintf(f, COLOR_CYAN "The number of active waves = %u" COLOR_RESET "\n\n", num_waves);
+
+   si_print_annotated_shader(sctx->vs_shader.current, waves, num_waves, f);
+   si_print_annotated_shader(sctx->tcs_shader.current, waves, num_waves, f);
+   si_print_annotated_shader(sctx->tes_shader.current, waves, num_waves, f);
+   si_print_annotated_shader(sctx->gs_shader.current, waves, num_waves, f);
+   si_print_annotated_shader(sctx->ps_shader.current, waves, num_waves, f);
+
+   /* Print waves executing shaders that are not currently bound. */
+   unsigned i;
+   bool found = false;
+   for (i = 0; i < num_waves; i++) {
+      if (waves[i].matched)
+         continue;
+
+      if (!found) {
+         fprintf(f, COLOR_CYAN "Waves not executing currently-bound shaders:" COLOR_RESET "\n");
+         found = true;
+      }
+      fprintf(f,
+              "    SE%u SH%u CU%u SIMD%u WAVE%u  EXEC=%016" PRIx64 "  INST=%08X %08X  PC=%" PRIx64
+              "\n",
+              waves[i].se, waves[i].sh, waves[i].cu, waves[i].simd, waves[i].wave, waves[i].exec,
+              waves[i].inst_dw0, waves[i].inst_dw1, waves[i].pc);
+   }
+   if (found)
+      fprintf(f, "\n\n");
 }
 
 static void si_dump_command(const char *title, const char *command, FILE *f)
 {
-       char line[2000];
+   char line[2000];
 
-       FILE *p = popen(command, "r");
-       if (!p)
-               return;
+   FILE *p = popen(command, "r");
+   if (!p)
+      return;
 
-       fprintf(f, COLOR_YELLOW "%s: " COLOR_RESET "\n", title);
-       while (fgets(line, sizeof(line), p))
-               fputs(line, f);
-       fprintf(f, "\n\n");
-       pclose(p);
+   fprintf(f, COLOR_YELLOW "%s: " COLOR_RESET "\n", title);
+   while (fgets(line, sizeof(line), p))
+      fputs(line, f);
+   fprintf(f, "\n\n");
+   pclose(p);
 }
 
-static void si_dump_debug_state(struct pipe_context *ctx, FILE *f,
-                               unsigned flags)
+static void si_dump_debug_state(struct pipe_context *ctx, FILE *f, unsigned flags)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
+   struct si_context *sctx = (struct si_context *)ctx;
 
-       if (sctx->log)
-               u_log_flush(sctx->log);
+   if (sctx->log)
+      u_log_flush(sctx->log);
 
-       if (flags & PIPE_DUMP_DEVICE_STATUS_REGISTERS) {
-               si_dump_debug_registers(sctx, f);
+   if (flags & PIPE_DUMP_DEVICE_STATUS_REGISTERS) {
+      si_dump_debug_registers(sctx, f);
 
-               si_dump_annotated_shaders(sctx, f);
-               si_dump_command("Active waves (raw data)", "umr -O halt_waves -wa | column -t", f);
-               si_dump_command("Wave information", "umr -O halt_waves,bits -wa", f);
-       }
+      si_dump_annotated_shaders(sctx, f);
+      si_dump_command("Active waves (raw data)", "umr -O halt_waves -wa | column -t", f);
+      si_dump_command("Wave information", "umr -O halt_waves,bits -wa", f);
+   }
 }
 
 void si_log_draw_state(struct si_context *sctx, struct u_log_context *log)
 {
-       struct si_shader_ctx_state *tcs_shader;
-
-       if (!log)
-               return;
-
-       tcs_shader = &sctx->tcs_shader;
-       if (sctx->tes_shader.cso && !sctx->tcs_shader.cso)
-               tcs_shader = &sctx->fixed_func_tcs_shader;
-
-       si_dump_framebuffer(sctx, log);
-
-       si_dump_gfx_shader(sctx, &sctx->vs_shader, log);
-       si_dump_gfx_shader(sctx, tcs_shader, log);
-       si_dump_gfx_shader(sctx, &sctx->tes_shader, log);
-       si_dump_gfx_shader(sctx, &sctx->gs_shader, log);
-       si_dump_gfx_shader(sctx, &sctx->ps_shader, log);
-
-       si_dump_descriptor_list(sctx->screen,
-                               &sctx->descriptors[SI_DESCS_RW_BUFFERS],
-                               "", "RW buffers", 4,
-                               sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots,
-                               si_identity, log);
-       si_dump_gfx_descriptors(sctx, &sctx->vs_shader, log);
-       si_dump_gfx_descriptors(sctx, tcs_shader, log);
-       si_dump_gfx_descriptors(sctx, &sctx->tes_shader, log);
-       si_dump_gfx_descriptors(sctx, &sctx->gs_shader, log);
-       si_dump_gfx_descriptors(sctx, &sctx->ps_shader, log);
+   struct si_shader_ctx_state *tcs_shader;
+
+   if (!log)
+      return;
+
+   tcs_shader = &sctx->tcs_shader;
+   if (sctx->tes_shader.cso && !sctx->tcs_shader.cso)
+      tcs_shader = &sctx->fixed_func_tcs_shader;
+
+   si_dump_framebuffer(sctx, log);
+
+   si_dump_gfx_shader(sctx, &sctx->vs_shader, log);
+   si_dump_gfx_shader(sctx, tcs_shader, log);
+   si_dump_gfx_shader(sctx, &sctx->tes_shader, log);
+   si_dump_gfx_shader(sctx, &sctx->gs_shader, log);
+   si_dump_gfx_shader(sctx, &sctx->ps_shader, log);
+
+   si_dump_descriptor_list(sctx->screen, &sctx->descriptors[SI_DESCS_RW_BUFFERS], "", "RW buffers",
+                           4, sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots, si_identity,
+                           log);
+   si_dump_gfx_descriptors(sctx, &sctx->vs_shader, log);
+   si_dump_gfx_descriptors(sctx, tcs_shader, log);
+   si_dump_gfx_descriptors(sctx, &sctx->tes_shader, log);
+   si_dump_gfx_descriptors(sctx, &sctx->gs_shader, log);
+   si_dump_gfx_descriptors(sctx, &sctx->ps_shader, log);
 }
 
 void si_log_compute_state(struct si_context *sctx, struct u_log_context *log)
 {
-       if (!log)
-               return;
+   if (!log)
+      return;
 
-       si_dump_compute_shader(sctx, log);
-       si_dump_compute_descriptors(sctx, log);
+   si_dump_compute_shader(sctx, log);
+   si_dump_compute_descriptors(sctx, log);
 }
 
-static void si_dump_dma(struct si_context *sctx,
-                       struct radeon_saved_cs *saved, FILE *f)
+static void si_dump_dma(struct si_context *sctx, struct radeon_saved_cs *saved, FILE *f)
 {
-       static const char ib_name[] = "sDMA IB";
-       unsigned i;
+   static const char ib_name[] = "sDMA IB";
+   unsigned i;
 
-       si_dump_bo_list(sctx, saved, f);
+   si_dump_bo_list(sctx, saved, f);
 
-       fprintf(f, "------------------ %s begin ------------------\n", ib_name);
+   fprintf(f, "------------------ %s begin ------------------\n", ib_name);
 
-       for (i = 0; i < saved->num_dw; ++i) {
-               fprintf(f, " %08x\n", saved->ib[i]);
-       }
+   for (i = 0; i < saved->num_dw; ++i) {
+      fprintf(f, " %08x\n", saved->ib[i]);
+   }
 
-       fprintf(f, "------------------- %s end -------------------\n", ib_name);
-       fprintf(f, "\n");
+   fprintf(f, "------------------- %s end -------------------\n", ib_name);
+   fprintf(f, "\n");
 
-       fprintf(f, "SDMA Dump Done.\n");
+   fprintf(f, "SDMA Dump Done.\n");
 }
 
-void si_check_vm_faults(struct si_context *sctx,
-                       struct radeon_saved_cs *saved, enum ring_type ring)
+void si_check_vm_faults(struct si_context *sctx, struct radeon_saved_cs *saved, enum ring_type ring)
 {
-       struct pipe_screen *screen = sctx->b.screen;
-       FILE *f;
-       uint64_t addr;
-       char cmd_line[4096];
-
-       if (!ac_vm_fault_occured(sctx->chip_class,
-                                &sctx->dmesg_timestamp, &addr))
-               return;
-
-       f = dd_get_debug_file(false);
-       if (!f)
-               return;
-
-       fprintf(f, "VM fault report.\n\n");
-       if (os_get_command_line(cmd_line, sizeof(cmd_line)))
-               fprintf(f, "Command: %s\n", cmd_line);
-       fprintf(f, "Driver vendor: %s\n", screen->get_vendor(screen));
-       fprintf(f, "Device vendor: %s\n", screen->get_device_vendor(screen));
-       fprintf(f, "Device name: %s\n\n", screen->get_name(screen));
-       fprintf(f, "Failing VM page: 0x%08"PRIx64"\n\n", addr);
-
-       if (sctx->apitrace_call_number)
-               fprintf(f, "Last apitrace call: %u\n\n",
-                       sctx->apitrace_call_number);
-
-       switch (ring) {
-       case RING_GFX: {
-               struct u_log_context log;
-               u_log_context_init(&log);
-
-               si_log_draw_state(sctx, &log);
-               si_log_compute_state(sctx, &log);
-               si_log_cs(sctx, &log, true);
-
-               u_log_new_page_print(&log, f);
-               u_log_context_destroy(&log);
-               break;
-       }
-       case RING_DMA:
-               si_dump_dma(sctx, saved, f);
-               break;
-
-       default:
-               break;
-       }
-
-       fclose(f);
-
-       fprintf(stderr, "Detected a VM fault, exiting...\n");
-       exit(0);
+   struct pipe_screen *screen = sctx->b.screen;
+   FILE *f;
+   uint64_t addr;
+   char cmd_line[4096];
+
+   if (!ac_vm_fault_occured(sctx->chip_class, &sctx->dmesg_timestamp, &addr))
+      return;
+
+   f = dd_get_debug_file(false);
+   if (!f)
+      return;
+
+   fprintf(f, "VM fault report.\n\n");
+   if (os_get_command_line(cmd_line, sizeof(cmd_line)))
+      fprintf(f, "Command: %s\n", cmd_line);
+   fprintf(f, "Driver vendor: %s\n", screen->get_vendor(screen));
+   fprintf(f, "Device vendor: %s\n", screen->get_device_vendor(screen));
+   fprintf(f, "Device name: %s\n\n", screen->get_name(screen));
+   fprintf(f, "Failing VM page: 0x%08" PRIx64 "\n\n", addr);
+
+   if (sctx->apitrace_call_number)
+      fprintf(f, "Last apitrace call: %u\n\n", sctx->apitrace_call_number);
+
+   switch (ring) {
+   case RING_GFX: {
+      struct u_log_context log;
+      u_log_context_init(&log);
+
+      si_log_draw_state(sctx, &log);
+      si_log_compute_state(sctx, &log);
+      si_log_cs(sctx, &log, true);
+
+      u_log_new_page_print(&log, f);
+      u_log_context_destroy(&log);
+      break;
+   }
+   case RING_DMA:
+      si_dump_dma(sctx, saved, f);
+      break;
+
+   default:
+      break;
+   }
+
+   fclose(f);
+
+   fprintf(stderr, "Detected a VM fault, exiting...\n");
+   exit(0);
 }
 
 void si_init_debug_functions(struct si_context *sctx)
 {
-       sctx->b.dump_debug_state = si_dump_debug_state;
-
-       /* Set the initial dmesg timestamp for this context, so that
-        * only new messages will be checked for VM faults.
-        */
-       if (sctx->screen->debug_flags & DBG(CHECK_VM))
-               ac_vm_fault_occured(sctx->chip_class,
-                                   &sctx->dmesg_timestamp, NULL);
+   sctx->b.dump_debug_state = si_dump_debug_state;
+
+   /* Set the initial dmesg timestamp for this context, so that
+    * only new messages will be checked for VM faults.
+    */
+   if (sctx->screen->debug_flags & DBG(CHECK_VM))
+      ac_vm_fault_occured(sctx->chip_class, &sctx->dmesg_timestamp, NULL);
 }
index b0e8db8646a982299bb6fa8348e4cd2ae76c884b..83c7425e09466e86f2fd50f0bba8618bd3d1fa75 100644
@@ -1,9 +1,11 @@
 OPT_BOOL(aux_debug, false, "Generate ddebug_dumps for the auxiliary context")
 OPT_BOOL(sync_compile, false, "Always compile synchronously (will cause stalls)")
 OPT_BOOL(dump_shader_binary, false, "Dump shader binary as part of ddebug_dumps")
-OPT_BOOL(debug_disassembly, false, "Report shader disassembly as part of driver debug messages (for shader db)")
+OPT_BOOL(debug_disassembly, false,
+         "Report shader disassembly as part of driver debug messages (for shader db)")
 OPT_BOOL(halt_shaders, false, "Halt shaders at the start (will hang)")
-OPT_BOOL(vs_fetch_always_opencode, false, "Always open code vertex fetches (less efficient, purely for testing)")
+OPT_BOOL(vs_fetch_always_opencode, false,
+         "Always open code vertex fetches (less efficient, purely for testing)")
 OPT_BOOL(prim_restart_tri_strips_only, false, "Only enable primitive restart for triangle strips")
 
 #undef OPT_BOOL
index fa2174bac5d084a412dd0d07f974c0d0da412c29..bf3ede49b3971fbe3701286ff8dfb0b0314e7d61 100644
 
 #include "si_pipe.h"
 #include "sid.h"
-
+#include "util/format/u_format.h"
 #include "util/hash_table.h"
 #include "util/u_idalloc.h"
-#include "util/format/u_format.h"
 #include "util/u_memory.h"
 #include "util/u_upload_mgr.h"
 
-
 /* NULL image and buffer descriptor for textures (alpha = 1) and images
  * (alpha = 0).
  *
  * This is the only reason why the buffer descriptor must be in words [4:7].
  */
 static uint32_t null_texture_descriptor[8] = {
-       0,
-       0,
-       0,
-       S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_1) |
-       S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
-       /* the rest must contain zeros, which is also used by the buffer
-        * descriptor */
+   0, 0, 0, S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_1) | S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
+   /* the rest must contain zeros, which is also used by the buffer
+    * descriptor */
 };
 
 static uint32_t null_image_descriptor[8] = {
-       0,
-       0,
-       0,
-       S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
-       /* the rest must contain zeros, which is also used by the buffer
-        * descriptor */
+   0, 0, 0, S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
+   /* the rest must contain zeros, which is also used by the buffer
+    * descriptor */
 };
 
 static uint64_t si_desc_extract_buffer_address(const uint32_t *desc)
 {
-       uint64_t va = desc[0] |
-                     ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32);
+   uint64_t va = desc[0] | ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32);
 
-       /* Sign-extend the 48-bit address. */
-       va <<= 16;
-       va = (int64_t)va >> 16;
-       return va;
+   /* Sign-extend the 48-bit address. */
+   va <<= 16;
+   va = (int64_t)va >> 16;
+   return va;
 }
 
-static void si_init_descriptor_list(uint32_t *desc_list,
-                                   unsigned element_dw_size,
-                                   unsigned num_elements,
-                                   const uint32_t *null_descriptor)
+static void si_init_descriptor_list(uint32_t *desc_list, unsigned element_dw_size,
+                                    unsigned num_elements, const uint32_t *null_descriptor)
 {
-       int i;
+   int i;
 
-       /* Initialize the array to NULL descriptors if the element size is 8. */
-       if (null_descriptor) {
-               assert(element_dw_size % 8 == 0);
-               for (i = 0; i < num_elements * element_dw_size / 8; i++)
-                       memcpy(desc_list + i * 8, null_descriptor, 8 * 4);
-       }
+   /* Initialize the array to NULL descriptors if the element size is 8. */
+   if (null_descriptor) {
+      assert(element_dw_size % 8 == 0);
+      for (i = 0; i < num_elements * element_dw_size / 8; i++)
+         memcpy(desc_list + i * 8, null_descriptor, 8 * 4);
+   }
 }
 
-static void si_init_descriptors(struct si_descriptors *desc,
-                               short shader_userdata_rel_index,
-                               unsigned element_dw_size,
-                               unsigned num_elements)
+static void si_init_descriptors(struct si_descriptors *desc, short shader_userdata_rel_index,
+                                unsigned element_dw_size, unsigned num_elements)
 {
-       desc->list = CALLOC(num_elements, element_dw_size * 4);
-       desc->element_dw_size = element_dw_size;
-       desc->num_elements = num_elements;
-       desc->shader_userdata_offset = shader_userdata_rel_index * 4;
-       desc->slot_index_to_bind_directly = -1;
+   desc->list = CALLOC(num_elements, element_dw_size * 4);
+   desc->element_dw_size = element_dw_size;
+   desc->num_elements = num_elements;
+   desc->shader_userdata_offset = shader_userdata_rel_index * 4;
+   desc->slot_index_to_bind_directly = -1;
 }
 
 static void si_release_descriptors(struct si_descriptors *desc)
 {
-       si_resource_reference(&desc->buffer, NULL);
-       FREE(desc->list);
+   si_resource_reference(&desc->buffer, NULL);
+   FREE(desc->list);
 }
 
-static bool si_upload_descriptors(struct si_context *sctx,
-                                 struct si_descriptors *desc)
+static bool si_upload_descriptors(struct si_context *sctx, struct si_descriptors *desc)
 {
-       unsigned slot_size = desc->element_dw_size * 4;
-       unsigned first_slot_offset = desc->first_active_slot * slot_size;
-       unsigned upload_size = desc->num_active_slots * slot_size;
+   unsigned slot_size = desc->element_dw_size * 4;
+   unsigned first_slot_offset = desc->first_active_slot * slot_size;
+   unsigned upload_size = desc->num_active_slots * slot_size;
 
-       /* Skip the upload if no shader is using the descriptors. dirty_mask
-        * will stay dirty and the descriptors will be uploaded when there is
-        * a shader using them.
-        */
-       if (!upload_size)
-               return true;
+   /* Skip the upload if no shader is using the descriptors. dirty_mask
+    * will stay dirty and the descriptors will be uploaded when there is
+    * a shader using them.
+    */
+   if (!upload_size)
+      return true;
 
-       /* If there is just one active descriptor, bind it directly. */
-       if ((int)desc->first_active_slot == desc->slot_index_to_bind_directly &&
-           desc->num_active_slots == 1) {
-               uint32_t *descriptor = &desc->list[desc->slot_index_to_bind_directly *
-                                                  desc->element_dw_size];
+   /* If there is just one active descriptor, bind it directly. */
+   if ((int)desc->first_active_slot == desc->slot_index_to_bind_directly &&
+       desc->num_active_slots == 1) {
+      uint32_t *descriptor = &desc->list[desc->slot_index_to_bind_directly * desc->element_dw_size];
 
-               /* The buffer is already in the buffer list. */
-               si_resource_reference(&desc->buffer, NULL);
-               desc->gpu_list = NULL;
-               desc->gpu_address = si_desc_extract_buffer_address(descriptor);
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
-               return true;
-       }
+      /* The buffer is already in the buffer list. */
+      si_resource_reference(&desc->buffer, NULL);
+      desc->gpu_list = NULL;
+      desc->gpu_address = si_desc_extract_buffer_address(descriptor);
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
+      return true;
+   }
 
-       uint32_t *ptr;
-       unsigned buffer_offset;
-       u_upload_alloc(sctx->b.const_uploader, first_slot_offset, upload_size,
-                      si_optimal_tcc_alignment(sctx, upload_size),
-                      &buffer_offset, (struct pipe_resource**)&desc->buffer,
-                      (void**)&ptr);
-       if (!desc->buffer) {
-               desc->gpu_address = 0;
-               return false; /* skip the draw call */
-       }
+   uint32_t *ptr;
+   unsigned buffer_offset;
+   u_upload_alloc(sctx->b.const_uploader, first_slot_offset, upload_size,
+                  si_optimal_tcc_alignment(sctx, upload_size), &buffer_offset,
+                  (struct pipe_resource **)&desc->buffer, (void **)&ptr);
+   if (!desc->buffer) {
+      desc->gpu_address = 0;
+      return false; /* skip the draw call */
+   }
 
-       util_memcpy_cpu_to_le32(ptr, (char*)desc->list + first_slot_offset,
-                               upload_size);
-       desc->gpu_list = ptr - first_slot_offset / 4;
+   util_memcpy_cpu_to_le32(ptr, (char *)desc->list + first_slot_offset, upload_size);
+   desc->gpu_list = ptr - first_slot_offset / 4;
 
-       radeon_add_to_buffer_list(sctx, sctx->gfx_cs, desc->buffer,
-                            RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
+   radeon_add_to_buffer_list(sctx, sctx->gfx_cs, desc->buffer, RADEON_USAGE_READ,
+                             RADEON_PRIO_DESCRIPTORS);
 
-       /* The shader pointer should point to slot 0. */
-       buffer_offset -= first_slot_offset;
-       desc->gpu_address = desc->buffer->gpu_address + buffer_offset;
+   /* The shader pointer should point to slot 0. */
+   buffer_offset -= first_slot_offset;
+   desc->gpu_address = desc->buffer->gpu_address + buffer_offset;
 
-       assert(desc->buffer->flags & RADEON_FLAG_32BIT);
-       assert((desc->buffer->gpu_address >> 32) == sctx->screen->info.address32_hi);
-       assert((desc->gpu_address >> 32) == sctx->screen->info.address32_hi);
+   assert(desc->buffer->flags & RADEON_FLAG_32BIT);
+   assert((desc->buffer->gpu_address >> 32) == sctx->screen->info.address32_hi);
+   assert((desc->gpu_address >> 32) == sctx->screen->info.address32_hi);
 
-       si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
-       return true;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
+   return true;
 }
 
-static void
-si_descriptors_begin_new_cs(struct si_context *sctx, struct si_descriptors *desc)
+static void si_descriptors_begin_new_cs(struct si_context *sctx, struct si_descriptors *desc)
 {
-       if (!desc->buffer)
-               return;
+   if (!desc->buffer)
+      return;
 
-       radeon_add_to_buffer_list(sctx, sctx->gfx_cs, desc->buffer,
-                                 RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
+   radeon_add_to_buffer_list(sctx, sctx->gfx_cs, desc->buffer, RADEON_USAGE_READ,
+                             RADEON_PRIO_DESCRIPTORS);
 }
 
 /* SAMPLER VIEWS */
 
-static inline enum radeon_bo_priority
-si_get_sampler_view_priority(struct si_resource *res)
+static inline enum radeon_bo_priority si_get_sampler_view_priority(struct si_resource *res)
 {
-       if (res->b.b.target == PIPE_BUFFER)
-               return RADEON_PRIO_SAMPLER_BUFFER;
+   if (res->b.b.target == PIPE_BUFFER)
+      return RADEON_PRIO_SAMPLER_BUFFER;
 
-       if (res->b.b.nr_samples > 1)
-               return RADEON_PRIO_SAMPLER_TEXTURE_MSAA;
+   if (res->b.b.nr_samples > 1)
+      return RADEON_PRIO_SAMPLER_TEXTURE_MSAA;
 
-       return RADEON_PRIO_SAMPLER_TEXTURE;
+   return RADEON_PRIO_SAMPLER_TEXTURE;
 }
 
-static struct si_descriptors *
-si_sampler_and_image_descriptors(struct si_context *sctx, unsigned shader)
+static struct si_descriptors *si_sampler_and_image_descriptors(struct si_context *sctx,
+                                                               unsigned shader)
 {
-       return &sctx->descriptors[si_sampler_and_image_descriptors_idx(shader)];
+   return &sctx->descriptors[si_sampler_and_image_descriptors_idx(shader)];
 }
 
 static void si_release_sampler_views(struct si_samplers *samplers)
 {
-       int i;
+   int i;
 
-       for (i = 0; i < ARRAY_SIZE(samplers->views); i++) {
-               pipe_sampler_view_reference(&samplers->views[i], NULL);
-       }
+   for (i = 0; i < ARRAY_SIZE(samplers->views); i++) {
+      pipe_sampler_view_reference(&samplers->views[i], NULL);
+   }
 }
 
-static void si_sampler_view_add_buffer(struct si_context *sctx,
-                                      struct pipe_resource *resource,
-                                      enum radeon_bo_usage usage,
-                                      bool is_stencil_sampler,
-                                      bool check_mem)
+static void si_sampler_view_add_buffer(struct si_context *sctx, struct pipe_resource *resource,
+                                       enum radeon_bo_usage usage, bool is_stencil_sampler,
+                                       bool check_mem)
 {
-       struct si_texture *tex = (struct si_texture*)resource;
-       enum radeon_bo_priority priority;
+   struct si_texture *tex = (struct si_texture *)resource;
+   enum radeon_bo_priority priority;
 
-       if (!resource)
-               return;
+   if (!resource)
+      return;
 
-       /* Use the flushed depth texture if direct sampling is unsupported. */
-       if (resource->target != PIPE_BUFFER &&
-           tex->is_depth && !si_can_sample_zs(tex, is_stencil_sampler))
-               tex = tex->flushed_depth_texture;
+   /* Use the flushed depth texture if direct sampling is unsupported. */
+   if (resource->target != PIPE_BUFFER && tex->is_depth &&
+       !si_can_sample_zs(tex, is_stencil_sampler))
+      tex = tex->flushed_depth_texture;
 
-       priority = si_get_sampler_view_priority(&tex->buffer);
-       radeon_add_to_gfx_buffer_list_check_mem(sctx, &tex->buffer, usage, priority,
-                                               check_mem);
+   priority = si_get_sampler_view_priority(&tex->buffer);
+   radeon_add_to_gfx_buffer_list_check_mem(sctx, &tex->buffer, usage, priority, check_mem);
 
-       if (resource->target == PIPE_BUFFER)
-               return;
+   if (resource->target == PIPE_BUFFER)
+      return;
 
-       /* Add separate DCC. */
-       if (tex->dcc_separate_buffer) {
-               radeon_add_to_gfx_buffer_list_check_mem(sctx, tex->dcc_separate_buffer,
-                                                       usage, RADEON_PRIO_SEPARATE_META, check_mem);
-       }
+   /* Add separate DCC. */
+   if (tex->dcc_separate_buffer) {
+      radeon_add_to_gfx_buffer_list_check_mem(sctx, tex->dcc_separate_buffer, usage,
+                                              RADEON_PRIO_SEPARATE_META, check_mem);
+   }
 }
 
-static void si_sampler_views_begin_new_cs(struct si_context *sctx,
-                                         struct si_samplers *samplers)
+static void si_sampler_views_begin_new_cs(struct si_context *sctx, struct si_samplers *samplers)
 {
-       unsigned mask = samplers->enabled_mask;
+   unsigned mask = samplers->enabled_mask;
 
-       /* Add buffers to the CS. */
-       while (mask) {
-               int i = u_bit_scan(&mask);
-               struct si_sampler_view *sview = (struct si_sampler_view *)samplers->views[i];
+   /* Add buffers to the CS. */
+   while (mask) {
+      int i = u_bit_scan(&mask);
+      struct si_sampler_view *sview = (struct si_sampler_view *)samplers->views[i];
 
-               si_sampler_view_add_buffer(sctx, sview->base.texture,
-                                          RADEON_USAGE_READ,
-                                          sview->is_stencil_sampler, false);
-       }
+      si_sampler_view_add_buffer(sctx, sview->base.texture, RADEON_USAGE_READ,
+                                 sview->is_stencil_sampler, false);
+   }
 }
 
 /* Set buffer descriptor fields that can be changed by reallocations. */
-static void si_set_buf_desc_address(struct si_resource *buf,
-                                   uint64_t offset, uint32_t *state)
+static void si_set_buf_desc_address(struct si_resource *buf, uint64_t offset, uint32_t *state)
 {
-       uint64_t va = buf->gpu_address + offset;
+   uint64_t va = buf->gpu_address + offset;
 
-       state[0] = va;
-       state[1] &= C_008F04_BASE_ADDRESS_HI;
-       state[1] |= S_008F04_BASE_ADDRESS_HI(va >> 32);
+   state[0] = va;
+   state[1] &= C_008F04_BASE_ADDRESS_HI;
+   state[1] |= S_008F04_BASE_ADDRESS_HI(va >> 32);
 }
 
 /* Set texture descriptor fields that can be changed by reallocations.
@@ -302,1316 +276,1195 @@ static void si_set_buf_desc_address(struct si_resource *buf,
  * \param is_stencil           select between separate Z & Stencil
  * \param state                        descriptor to update
  */
-void si_set_mutable_tex_desc_fields(struct si_screen *sscreen,
-                                   struct si_texture *tex,
-                                   const struct legacy_surf_level *base_level_info,
-                                   unsigned base_level, unsigned first_level,
-                                   unsigned block_width, bool is_stencil,
-                                   uint32_t *state)
-{
-       uint64_t va, meta_va = 0;
-
-       if (tex->is_depth && !si_can_sample_zs(tex, is_stencil)) {
-               tex = tex->flushed_depth_texture;
-               is_stencil = false;
-       }
-
-       va = tex->buffer.gpu_address;
-
-       if (sscreen->info.chip_class >= GFX9) {
-               /* Only stencil_offset needs to be added here. */
-               if (is_stencil)
-                       va += tex->surface.u.gfx9.stencil_offset;
-               else
-                       va += tex->surface.u.gfx9.surf_offset;
-       } else {
-               va += base_level_info->offset;
-       }
-
-       state[0] = va >> 8;
-       state[1] &= C_008F14_BASE_ADDRESS_HI;
-       state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40);
-
-       /* Only macrotiled modes can set tile swizzle.
-        * GFX9 doesn't use (legacy) base_level_info.
-        */
-       if (sscreen->info.chip_class >= GFX9 ||
-           base_level_info->mode == RADEON_SURF_MODE_2D)
-               state[0] |= tex->surface.tile_swizzle;
-
-       if (sscreen->info.chip_class >= GFX8) {
-               state[6] &= C_008F28_COMPRESSION_EN;
-
-               if (vi_dcc_enabled(tex, first_level)) {
-                       meta_va = (!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) +
-                                 tex->surface.dcc_offset;
-
-                       if (sscreen->info.chip_class == GFX8) {
-                               meta_va += base_level_info->dcc_offset;
-                               assert(base_level_info->mode == RADEON_SURF_MODE_2D);
-                       }
-
-                       unsigned dcc_tile_swizzle = tex->surface.tile_swizzle << 8;
-                       dcc_tile_swizzle &= tex->surface.dcc_alignment - 1;
-                       meta_va |= dcc_tile_swizzle;
-               } else if (vi_tc_compat_htile_enabled(tex, first_level,
-                                                     is_stencil ? PIPE_MASK_S : PIPE_MASK_Z)) {
-                       meta_va = tex->buffer.gpu_address + tex->surface.htile_offset;
-               }
-
-               if (meta_va)
-                       state[6] |= S_008F28_COMPRESSION_EN(1);
-       }
-
-       if (sscreen->info.chip_class >= GFX8 && sscreen->info.chip_class <= GFX9)
-               state[7] = meta_va >> 8;
-
-       if (sscreen->info.chip_class >= GFX10) {
-               state[3] &= C_00A00C_SW_MODE;
-
-               if (is_stencil) {
-                       state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode);
-               } else {
-                       state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode);
-               }
-
-               state[6] &= C_00A018_META_DATA_ADDRESS_LO &
-                           C_00A018_META_PIPE_ALIGNED;
-
-               if (meta_va) {
-                       struct gfx9_surf_meta_flags meta;
-
-                       if (tex->surface.dcc_offset)
-                               meta = tex->surface.u.gfx9.dcc;
-                       else
-                               meta = tex->surface.u.gfx9.htile;
-
-                       state[6] |= S_00A018_META_PIPE_ALIGNED(meta.pipe_aligned) |
-                                   S_00A018_META_DATA_ADDRESS_LO(meta_va >> 8);
-               }
-
-               state[7] = meta_va >> 16;
-       } else if (sscreen->info.chip_class == GFX9) {
-               state[3] &= C_008F1C_SW_MODE;
-               state[4] &= C_008F20_PITCH;
-
-               if (is_stencil) {
-                       state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode);
-                       state[4] |= S_008F20_PITCH(tex->surface.u.gfx9.stencil.epitch);
-               } else {
-                       state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode);
-                       state[4] |= S_008F20_PITCH(tex->surface.u.gfx9.surf.epitch);
-               }
-
-               state[5] &= C_008F24_META_DATA_ADDRESS &
-                           C_008F24_META_PIPE_ALIGNED &
-                           C_008F24_META_RB_ALIGNED;
-               if (meta_va) {
-                       struct gfx9_surf_meta_flags meta;
-
-                       if (tex->surface.dcc_offset)
-                               meta = tex->surface.u.gfx9.dcc;
-                       else
-                               meta = tex->surface.u.gfx9.htile;
-
-                       state[5] |= S_008F24_META_DATA_ADDRESS(meta_va >> 40) |
-                                   S_008F24_META_PIPE_ALIGNED(meta.pipe_aligned) |
-                                   S_008F24_META_RB_ALIGNED(meta.rb_aligned);
-               }
-       } else {
-               /* GFX6-GFX8 */
-               unsigned pitch = base_level_info->nblk_x * block_width;
-               unsigned index = si_tile_mode_index(tex, base_level, is_stencil);
-
-               state[3] &= C_008F1C_TILING_INDEX;
-               state[3] |= S_008F1C_TILING_INDEX(index);
-               state[4] &= C_008F20_PITCH;
-               state[4] |= S_008F20_PITCH(pitch - 1);
-       }
+void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture *tex,
+                                    const struct legacy_surf_level *base_level_info,
+                                    unsigned base_level, unsigned first_level, unsigned block_width,
+                                    bool is_stencil, uint32_t *state)
+{
+   uint64_t va, meta_va = 0;
+
+   if (tex->is_depth && !si_can_sample_zs(tex, is_stencil)) {
+      tex = tex->flushed_depth_texture;
+      is_stencil = false;
+   }
+
+   va = tex->buffer.gpu_address;
+
+   if (sscreen->info.chip_class >= GFX9) {
+      /* Only stencil_offset needs to be added here. */
+      if (is_stencil)
+         va += tex->surface.u.gfx9.stencil_offset;
+      else
+         va += tex->surface.u.gfx9.surf_offset;
+   } else {
+      va += base_level_info->offset;
+   }
+
+   state[0] = va >> 8;
+   state[1] &= C_008F14_BASE_ADDRESS_HI;
+   state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40);
+
+   /* Only macrotiled modes can set tile swizzle.
+    * GFX9 doesn't use (legacy) base_level_info.
+    */
+   if (sscreen->info.chip_class >= GFX9 || base_level_info->mode == RADEON_SURF_MODE_2D)
+      state[0] |= tex->surface.tile_swizzle;
+
+   if (sscreen->info.chip_class >= GFX8) {
+      state[6] &= C_008F28_COMPRESSION_EN;
+
+      if (vi_dcc_enabled(tex, first_level)) {
+         meta_va =
+            (!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) + tex->surface.dcc_offset;
+
+         if (sscreen->info.chip_class == GFX8) {
+            meta_va += base_level_info->dcc_offset;
+            assert(base_level_info->mode == RADEON_SURF_MODE_2D);
+         }
+
+         unsigned dcc_tile_swizzle = tex->surface.tile_swizzle << 8;
+         dcc_tile_swizzle &= tex->surface.dcc_alignment - 1;
+         meta_va |= dcc_tile_swizzle;
+      } else if (vi_tc_compat_htile_enabled(tex, first_level,
+                                            is_stencil ? PIPE_MASK_S : PIPE_MASK_Z)) {
+         meta_va = tex->buffer.gpu_address + tex->surface.htile_offset;
+      }
+
+      if (meta_va)
+         state[6] |= S_008F28_COMPRESSION_EN(1);
+   }
+
+   if (sscreen->info.chip_class >= GFX8 && sscreen->info.chip_class <= GFX9)
+      state[7] = meta_va >> 8;
+
+   if (sscreen->info.chip_class >= GFX10) {
+      state[3] &= C_00A00C_SW_MODE;
+
+      if (is_stencil) {
+         state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode);
+      } else {
+         state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode);
+      }
+
+      state[6] &= C_00A018_META_DATA_ADDRESS_LO & C_00A018_META_PIPE_ALIGNED;
+
+      if (meta_va) {
+         struct gfx9_surf_meta_flags meta;
+
+         if (tex->surface.dcc_offset)
+            meta = tex->surface.u.gfx9.dcc;
+         else
+            meta = tex->surface.u.gfx9.htile;
+
+         state[6] |= S_00A018_META_PIPE_ALIGNED(meta.pipe_aligned) |
+                     S_00A018_META_DATA_ADDRESS_LO(meta_va >> 8);
+      }
+
+      state[7] = meta_va >> 16;
+   } else if (sscreen->info.chip_class == GFX9) {
+      state[3] &= C_008F1C_SW_MODE;
+      state[4] &= C_008F20_PITCH;
+
+      if (is_stencil) {
+         state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode);
+         state[4] |= S_008F20_PITCH(tex->surface.u.gfx9.stencil.epitch);
+      } else {
+         state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode);
+         state[4] |= S_008F20_PITCH(tex->surface.u.gfx9.surf.epitch);
+      }
+
+      state[5] &=
+         C_008F24_META_DATA_ADDRESS & C_008F24_META_PIPE_ALIGNED & C_008F24_META_RB_ALIGNED;
+      if (meta_va) {
+         struct gfx9_surf_meta_flags meta;
+
+         if (tex->surface.dcc_offset)
+            meta = tex->surface.u.gfx9.dcc;
+         else
+            meta = tex->surface.u.gfx9.htile;
+
+         state[5] |= S_008F24_META_DATA_ADDRESS(meta_va >> 40) |
+                     S_008F24_META_PIPE_ALIGNED(meta.pipe_aligned) |
+                     S_008F24_META_RB_ALIGNED(meta.rb_aligned);
+      }
+   } else {
+      /* GFX6-GFX8 */
+      unsigned pitch = base_level_info->nblk_x * block_width;
+      unsigned index = si_tile_mode_index(tex, base_level, is_stencil);
+
+      state[3] &= C_008F1C_TILING_INDEX;
+      state[3] |= S_008F1C_TILING_INDEX(index);
+      state[4] &= C_008F20_PITCH;
+      state[4] |= S_008F20_PITCH(pitch - 1);
+   }
 }
 
 static void si_set_sampler_state_desc(struct si_sampler_state *sstate,
-                                     struct si_sampler_view *sview,
-                                     struct si_texture *tex,
-                                     uint32_t *desc)
-{
-       if (sview && sview->is_integer)
-               memcpy(desc, sstate->integer_val, 4*4);
-       else if (tex && tex->upgraded_depth &&
-                (!sview || !sview->is_stencil_sampler))
-               memcpy(desc, sstate->upgraded_depth_val, 4*4);
-       else
-               memcpy(desc, sstate->val, 4*4);
-}
-
-static void si_set_sampler_view_desc(struct si_context *sctx,
-                                    struct si_sampler_view *sview,
-                                    struct si_sampler_state *sstate,
-                                    uint32_t *desc)
-{
-       struct pipe_sampler_view *view = &sview->base;
-       struct si_texture *tex = (struct si_texture *)view->texture;
-       bool is_buffer = tex->buffer.b.b.target == PIPE_BUFFER;
-
-       if (unlikely(!is_buffer && sview->dcc_incompatible)) {
-               if (vi_dcc_enabled(tex, view->u.tex.first_level))
-                       if (!si_texture_disable_dcc(sctx, tex))
-                               si_decompress_dcc(sctx, tex);
-
-               sview->dcc_incompatible = false;
-       }
-
-       assert(tex); /* views with texture == NULL aren't supported */
-       memcpy(desc, sview->state, 8*4);
-
-       if (is_buffer) {
-               si_set_buf_desc_address(&tex->buffer,
-                                       sview->base.u.buf.offset,
-                                       desc + 4);
-       } else {
-               bool is_separate_stencil = tex->db_compatible &&
-                                          sview->is_stencil_sampler;
-
-               si_set_mutable_tex_desc_fields(sctx->screen, tex,
-                                              sview->base_level_info,
-                                              sview->base_level,
-                                              sview->base.u.tex.first_level,
-                                              sview->block_width,
-                                              is_separate_stencil,
-                                              desc);
-       }
-
-       if (!is_buffer && tex->surface.fmask_size) {
-               memcpy(desc + 8, sview->fmask_state, 8*4);
-       } else {
-               /* Disable FMASK and bind sampler state in [12:15]. */
-               memcpy(desc + 8, null_texture_descriptor, 4*4);
-
-               if (sstate)
-                       si_set_sampler_state_desc(sstate, sview,
-                                                 is_buffer ? NULL : tex,
-                                                 desc + 12);
-       }
+                                      struct si_sampler_view *sview, struct si_texture *tex,
+                                      uint32_t *desc)
+{
+   if (sview && sview->is_integer)
+      memcpy(desc, sstate->integer_val, 4 * 4);
+   else if (tex && tex->upgraded_depth && (!sview || !sview->is_stencil_sampler))
+      memcpy(desc, sstate->upgraded_depth_val, 4 * 4);
+   else
+      memcpy(desc, sstate->val, 4 * 4);
+}
+
+static void si_set_sampler_view_desc(struct si_context *sctx, struct si_sampler_view *sview,
+                                     struct si_sampler_state *sstate, uint32_t *desc)
+{
+   struct pipe_sampler_view *view = &sview->base;
+   struct si_texture *tex = (struct si_texture *)view->texture;
+   bool is_buffer = tex->buffer.b.b.target == PIPE_BUFFER;
+
+   if (unlikely(!is_buffer && sview->dcc_incompatible)) {
+      if (vi_dcc_enabled(tex, view->u.tex.first_level))
+         if (!si_texture_disable_dcc(sctx, tex))
+            si_decompress_dcc(sctx, tex);
+
+      sview->dcc_incompatible = false;
+   }
+
+   assert(tex); /* views with texture == NULL aren't supported */
+   memcpy(desc, sview->state, 8 * 4);
+
+   if (is_buffer) {
+      si_set_buf_desc_address(&tex->buffer, sview->base.u.buf.offset, desc + 4);
+   } else {
+      bool is_separate_stencil = tex->db_compatible && sview->is_stencil_sampler;
+
+      si_set_mutable_tex_desc_fields(sctx->screen, tex, sview->base_level_info, sview->base_level,
+                                     sview->base.u.tex.first_level, sview->block_width,
+                                     is_separate_stencil, desc);
+   }
+
+   if (!is_buffer && tex->surface.fmask_size) {
+      memcpy(desc + 8, sview->fmask_state, 8 * 4);
+   } else {
+      /* Disable FMASK and bind sampler state in [12:15]. */
+      memcpy(desc + 8, null_texture_descriptor, 4 * 4);
+
+      if (sstate)
+         si_set_sampler_state_desc(sstate, sview, is_buffer ? NULL : tex, desc + 12);
+   }
 }
 
 static bool color_needs_decompression(struct si_texture *tex)
 {
-       return tex->surface.fmask_size ||
-              (tex->dirty_level_mask &&
-               (tex->cmask_buffer || tex->surface.dcc_offset));
+   return tex->surface.fmask_size ||
+          (tex->dirty_level_mask && (tex->cmask_buffer || tex->surface.dcc_offset));
 }
 
 static bool depth_needs_decompression(struct si_texture *tex)
 {
-       /* If the depth/stencil texture is TC-compatible, no decompression
-        * will be done. The decompression function will only flush DB caches
-        * to make it coherent with shaders. That's necessary because the driver
-        * doesn't flush DB caches in any other case.
-        */
-       return tex->db_compatible;
-}
-
-static void si_set_sampler_view(struct si_context *sctx,
-                               unsigned shader,
-                               unsigned slot, struct pipe_sampler_view *view,
-                               bool disallow_early_out)
-{
-       struct si_samplers *samplers = &sctx->samplers[shader];
-       struct si_sampler_view *sview = (struct si_sampler_view*)view;
-       struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader);
-       unsigned desc_slot = si_get_sampler_slot(slot);
-       uint32_t *desc = descs->list + desc_slot * 16;
-
-       if (samplers->views[slot] == view && !disallow_early_out)
-               return;
-
-       if (view) {
-               struct si_texture *tex = (struct si_texture *)view->texture;
-
-               si_set_sampler_view_desc(sctx, sview,
-                                        samplers->sampler_states[slot], desc);
-
-               if (tex->buffer.b.b.target == PIPE_BUFFER) {
-                       tex->buffer.bind_history |= PIPE_BIND_SAMPLER_VIEW;
-                       samplers->needs_depth_decompress_mask &= ~(1u << slot);
-                       samplers->needs_color_decompress_mask &= ~(1u << slot);
-               } else {
-                       if (depth_needs_decompression(tex)) {
-                               samplers->needs_depth_decompress_mask |= 1u << slot;
-                       } else {
-                               samplers->needs_depth_decompress_mask &= ~(1u << slot);
-                       }
-                       if (color_needs_decompression(tex)) {
-                               samplers->needs_color_decompress_mask |= 1u << slot;
-                       } else {
-                               samplers->needs_color_decompress_mask &= ~(1u << slot);
-                       }
-
-                       if (tex->surface.dcc_offset &&
-                           p_atomic_read(&tex->framebuffers_bound))
-                               sctx->need_check_render_feedback = true;
-               }
-
-               pipe_sampler_view_reference(&samplers->views[slot], view);
-               samplers->enabled_mask |= 1u << slot;
-
-               /* Since this can flush, it must be done after enabled_mask is
-                * updated. */
-               si_sampler_view_add_buffer(sctx, view->texture,
-                                          RADEON_USAGE_READ,
-                                          sview->is_stencil_sampler, true);
-       } else {
-               pipe_sampler_view_reference(&samplers->views[slot], NULL);
-               memcpy(desc, null_texture_descriptor, 8*4);
-               /* Only clear the lower dwords of FMASK. */
-               memcpy(desc + 8, null_texture_descriptor, 4*4);
-               /* Re-set the sampler state if we are transitioning from FMASK. */
-               if (samplers->sampler_states[slot])
-                       si_set_sampler_state_desc(samplers->sampler_states[slot], NULL, NULL,
-                                                 desc + 12);
-
-               samplers->enabled_mask &= ~(1u << slot);
-               samplers->needs_depth_decompress_mask &= ~(1u << slot);
-               samplers->needs_color_decompress_mask &= ~(1u << slot);
-       }
-
-       sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
-}
-
-static void si_update_shader_needs_decompress_mask(struct si_context *sctx,
-                                                  unsigned shader)
-{
-       struct si_samplers *samplers = &sctx->samplers[shader];
-       unsigned shader_bit = 1 << shader;
-
-       if (samplers->needs_depth_decompress_mask ||
-           samplers->needs_color_decompress_mask ||
-           sctx->images[shader].needs_color_decompress_mask)
-               sctx->shader_needs_decompress_mask |= shader_bit;
-       else
-               sctx->shader_needs_decompress_mask &= ~shader_bit;
-}
-
-static void si_set_sampler_views(struct pipe_context *ctx,
-                                enum pipe_shader_type shader, unsigned start,
-                                 unsigned count,
-                                struct pipe_sampler_view **views)
-{
-       struct si_context *sctx = (struct si_context *)ctx;
-       int i;
-
-       if (!count || shader >= SI_NUM_SHADERS)
-               return;
-
-       if (views) {
-               for (i = 0; i < count; i++)
-                       si_set_sampler_view(sctx, shader, start + i, views[i], false);
-       } else {
-               for (i = 0; i < count; i++)
-                       si_set_sampler_view(sctx, shader, start + i, NULL, false);
-       }
-
-       si_update_shader_needs_decompress_mask(sctx, shader);
-}
-
-static void
-si_samplers_update_needs_color_decompress_mask(struct si_samplers *samplers)
-{
-       unsigned mask = samplers->enabled_mask;
-
-       while (mask) {
-               int i = u_bit_scan(&mask);
-               struct pipe_resource *res = samplers->views[i]->texture;
-
-               if (res && res->target != PIPE_BUFFER) {
-                       struct si_texture *tex = (struct si_texture *)res;
-
-                       if (color_needs_decompression(tex)) {
-                               samplers->needs_color_decompress_mask |= 1u << i;
-                       } else {
-                               samplers->needs_color_decompress_mask &= ~(1u << i);
-                       }
-               }
-       }
+   /* If the depth/stencil texture is TC-compatible, no decompression
+    * will be done. The decompression function will only flush DB caches
+    * to make it coherent with shaders. That's necessary because the driver
+    * doesn't flush DB caches in any other case.
+    */
+   return tex->db_compatible;
+}
+
+static void si_set_sampler_view(struct si_context *sctx, unsigned shader, unsigned slot,
+                                struct pipe_sampler_view *view, bool disallow_early_out)
+{
+   struct si_samplers *samplers = &sctx->samplers[shader];
+   struct si_sampler_view *sview = (struct si_sampler_view *)view;
+   struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader);
+   unsigned desc_slot = si_get_sampler_slot(slot);
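+   /* Each combined sampler/FMASK slot is 16 dwords: texture resource in [0:7],
+    * FMASK in [8:15], or a null FMASK in [8:11] with the sampler state in [12:15]. */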
+   uint32_t *desc = descs->list + desc_slot * 16;
+
+   if (samplers->views[slot] == view && !disallow_early_out)
+      return;
+
+   if (view) {
+      struct si_texture *tex = (struct si_texture *)view->texture;
+
+      si_set_sampler_view_desc(sctx, sview, samplers->sampler_states[slot], desc);
+
+      if (tex->buffer.b.b.target == PIPE_BUFFER) {
+         tex->buffer.bind_history |= PIPE_BIND_SAMPLER_VIEW;
+         samplers->needs_depth_decompress_mask &= ~(1u << slot);
+         samplers->needs_color_decompress_mask &= ~(1u << slot);
+      } else {
+         if (depth_needs_decompression(tex)) {
+            samplers->needs_depth_decompress_mask |= 1u << slot;
+         } else {
+            samplers->needs_depth_decompress_mask &= ~(1u << slot);
+         }
+         if (color_needs_decompression(tex)) {
+            samplers->needs_color_decompress_mask |= 1u << slot;
+         } else {
+            samplers->needs_color_decompress_mask &= ~(1u << slot);
+         }
+
+         if (tex->surface.dcc_offset && p_atomic_read(&tex->framebuffers_bound))
+            sctx->need_check_render_feedback = true;
+      }
+
+      pipe_sampler_view_reference(&samplers->views[slot], view);
+      samplers->enabled_mask |= 1u << slot;
+
+      /* Since this can flush, it must be done after enabled_mask is
+       * updated. */
+      si_sampler_view_add_buffer(sctx, view->texture, RADEON_USAGE_READ, sview->is_stencil_sampler,
+                                 true);
+   } else {
+      pipe_sampler_view_reference(&samplers->views[slot], NULL);
+      memcpy(desc, null_texture_descriptor, 8 * 4);
+      /* Only clear the lower dwords of FMASK. */
+      memcpy(desc + 8, null_texture_descriptor, 4 * 4);
+      /* Re-set the sampler state if we are transitioning from FMASK. */
+      if (samplers->sampler_states[slot])
+         si_set_sampler_state_desc(samplers->sampler_states[slot], NULL, NULL, desc + 12);
+
+      samplers->enabled_mask &= ~(1u << slot);
+      samplers->needs_depth_decompress_mask &= ~(1u << slot);
+      samplers->needs_color_decompress_mask &= ~(1u << slot);
+   }
+
+   sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
+}
+
+static void si_update_shader_needs_decompress_mask(struct si_context *sctx, unsigned shader)
+{
+   struct si_samplers *samplers = &sctx->samplers[shader];
+   unsigned shader_bit = 1 << shader;
+
+   if (samplers->needs_depth_decompress_mask || samplers->needs_color_decompress_mask ||
+       sctx->images[shader].needs_color_decompress_mask)
+      sctx->shader_needs_decompress_mask |= shader_bit;
+   else
+      sctx->shader_needs_decompress_mask &= ~shader_bit;
+}
+
+static void si_set_sampler_views(struct pipe_context *ctx, enum pipe_shader_type shader,
+                                 unsigned start, unsigned count, struct pipe_sampler_view **views)
+{
+   struct si_context *sctx = (struct si_context *)ctx;
+   int i;
+
+   if (!count || shader >= SI_NUM_SHADERS)
+      return;
+
+   if (views) {
+      for (i = 0; i < count; i++)
+         si_set_sampler_view(sctx, shader, start + i, views[i], false);
+   } else {
+      for (i = 0; i < count; i++)
+         si_set_sampler_view(sctx, shader, start + i, NULL, false);
+   }
+
+   si_update_shader_needs_decompress_mask(sctx, shader);
+}
+
+static void si_samplers_update_needs_color_decompress_mask(struct si_samplers *samplers)
+{
+   unsigned mask = samplers->enabled_mask;
+
+   while (mask) {
+      int i = u_bit_scan(&mask);
+      struct pipe_resource *res = samplers->views[i]->texture;
+
+      if (res && res->target != PIPE_BUFFER) {
+         struct si_texture *tex = (struct si_texture *)res;
+
+         if (color_needs_decompression(tex)) {
+            samplers->needs_color_decompress_mask |= 1u << i;
+         } else {
+            samplers->needs_color_decompress_mask &= ~(1u << i);
+         }
+      }
+   }
 }
 
 /* IMAGE VIEWS */
 
-static void
-si_release_image_views(struct si_images *images)
+static void si_release_image_views(struct si_images *images)
 {
-       unsigned i;
+   unsigned i;
 
-       for (i = 0; i < SI_NUM_IMAGES; ++i) {
-               struct pipe_image_view *view = &images->views[i];
+   for (i = 0; i < SI_NUM_IMAGES; ++i) {
+      struct pipe_image_view *view = &images->views[i];
 
-               pipe_resource_reference(&view->resource, NULL);
-       }
+      pipe_resource_reference(&view->resource, NULL);
+   }
 }
 
-static void
-si_image_views_begin_new_cs(struct si_context *sctx, struct si_images *images)
+static void si_image_views_begin_new_cs(struct si_context *sctx, struct si_images *images)
 {
-       uint mask = images->enabled_mask;
+   uint mask = images->enabled_mask;
+
+   /* Add buffers to the CS. */
+   while (mask) {
+      int i = u_bit_scan(&mask);
+      struct pipe_image_view *view = &images->views[i];
 
-       /* Add buffers to the CS. */
-       while (mask) {
-               int i = u_bit_scan(&mask);
-               struct pipe_image_view *view = &images->views[i];
+      assert(view->resource);
 
-               assert(view->resource);
+      si_sampler_view_add_buffer(sctx, view->resource, RADEON_USAGE_READWRITE, false, false);
+   }
+}
 
-               si_sampler_view_add_buffer(sctx, view->resource,
-                                          RADEON_USAGE_READWRITE, false, false);
-       }
-}
-
-static void
-si_disable_shader_image(struct si_context *ctx, unsigned shader, unsigned slot)
-{
-       struct si_images *images = &ctx->images[shader];
-
-       if (images->enabled_mask & (1u << slot)) {
-               struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader);
-               unsigned desc_slot = si_get_image_slot(slot);
-
-               pipe_resource_reference(&images->views[slot].resource, NULL);
-               images->needs_color_decompress_mask &= ~(1 << slot);
-
-               memcpy(descs->list + desc_slot*8, null_image_descriptor, 8*4);
-               images->enabled_mask &= ~(1u << slot);
-               ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
-       }
-}
-
-static void
-si_mark_image_range_valid(const struct pipe_image_view *view)
-{
-       struct si_resource *res = si_resource(view->resource);
-
-       if (res->b.b.target != PIPE_BUFFER)
-               return;
-
-       util_range_add(&res->b.b, &res->valid_buffer_range,
-                      view->u.buf.offset,
-                      view->u.buf.offset + view->u.buf.size);
-}
-
-static void si_set_shader_image_desc(struct si_context *ctx,
-                                    const struct pipe_image_view *view,
-                                    bool skip_decompress,
-                                    uint32_t *desc, uint32_t *fmask_desc)
-{
-       struct si_screen *screen = ctx->screen;
-       struct si_resource *res;
-
-       res = si_resource(view->resource);
-
-       if (res->b.b.target == PIPE_BUFFER ||
-           view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) {
-               if (view->access & PIPE_IMAGE_ACCESS_WRITE)
-                       si_mark_image_range_valid(view);
-
-               si_make_buffer_descriptor(screen, res,
-                                         view->format,
-                                         view->u.buf.offset,
-                                         view->u.buf.size, desc);
-               si_set_buf_desc_address(res, view->u.buf.offset, desc + 4);
-       } else {
-               static const unsigned char swizzle[4] = { 0, 1, 2, 3 };
-               struct si_texture *tex = (struct si_texture *)res;
-               unsigned level = view->u.tex.level;
-               unsigned width, height, depth, hw_level;
-               bool uses_dcc = vi_dcc_enabled(tex, level);
-               unsigned access = view->access;
-
-               assert(!tex->is_depth);
-               assert(fmask_desc || tex->surface.fmask_offset == 0);
-
-               if (uses_dcc && !skip_decompress &&
-                   (access & PIPE_IMAGE_ACCESS_WRITE ||
-                    !vi_dcc_formats_compatible(screen, res->b.b.format, view->format))) {
-                       /* If DCC can't be disabled, at least decompress it.
-                        * The decompression is relatively cheap if the surface
-                        * has been decompressed already.
-                        */
-                       if (!si_texture_disable_dcc(ctx, tex))
-                               si_decompress_dcc(ctx, tex);
-               }
-
-               if (ctx->chip_class >= GFX9) {
-                       /* Always set the base address. The swizzle modes don't
-                        * allow setting mipmap level offsets as the base.
-                        */
-                       width = res->b.b.width0;
-                       height = res->b.b.height0;
-                       depth = res->b.b.depth0;
-                       hw_level = level;
-               } else {
-                       /* Always force the base level to the selected level.
-                        *
-                        * This is required for 3D textures, where otherwise
-                        * selecting a single slice for non-layered bindings
-                        * fails. It doesn't hurt the other targets.
-                        */
-                       width = u_minify(res->b.b.width0, level);
-                       height = u_minify(res->b.b.height0, level);
-                       depth = u_minify(res->b.b.depth0, level);
-                       hw_level = 0;
-               }
-
-               screen->make_texture_descriptor(screen, tex,
-                                          false, res->b.b.target,
-                                          view->format, swizzle,
-                                          hw_level, hw_level,
-                                          view->u.tex.first_layer,
-                                          view->u.tex.last_layer,
-                                          width, height, depth,
-                                          desc, fmask_desc);
-               si_set_mutable_tex_desc_fields(screen, tex,
-                                              &tex->surface.u.legacy.level[level],
-                                              level, level,
-                                              util_format_get_blockwidth(view->format),
-                                              false, desc);
-       }
-}
-
-static void si_set_shader_image(struct si_context *ctx,
-                               unsigned shader,
-                               unsigned slot, const struct pipe_image_view *view,
-                               bool skip_decompress)
-{
-       struct si_images *images = &ctx->images[shader];
-       struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader);
-       struct si_resource *res;
-
-       if (!view || !view->resource) {
-               si_disable_shader_image(ctx, shader, slot);
-               return;
-       }
-
-       res = si_resource(view->resource);
-
-       if (&images->views[slot] != view)
-               util_copy_image_view(&images->views[slot], view);
-
-       si_set_shader_image_desc(ctx, view, skip_decompress,
-                                descs->list + si_get_image_slot(slot) * 8,
-                                descs->list + si_get_image_slot(slot + SI_NUM_IMAGES) * 8);
-
-       if (res->b.b.target == PIPE_BUFFER ||
-           view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) {
-               images->needs_color_decompress_mask &= ~(1 << slot);
-               res->bind_history |= PIPE_BIND_SHADER_IMAGE;
-       } else {
-               struct si_texture *tex = (struct si_texture *)res;
-               unsigned level = view->u.tex.level;
-
-               if (color_needs_decompression(tex)) {
-                       images->needs_color_decompress_mask |= 1 << slot;
-               } else {
-                       images->needs_color_decompress_mask &= ~(1 << slot);
-               }
-
-               if (vi_dcc_enabled(tex, level) &&
-                   p_atomic_read(&tex->framebuffers_bound))
-                       ctx->need_check_render_feedback = true;
-       }
-
-       images->enabled_mask |= 1u << slot;
-       ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
-
-       /* Since this can flush, it must be done after enabled_mask is updated. */
-       si_sampler_view_add_buffer(ctx, &res->b.b,
-                                  (view->access & PIPE_IMAGE_ACCESS_WRITE) ?
-                                  RADEON_USAGE_READWRITE : RADEON_USAGE_READ,
-                                  false, true);
-}
-
-static void
-si_set_shader_images(struct pipe_context *pipe,
-                    enum pipe_shader_type shader,
-                    unsigned start_slot, unsigned count,
-                    const struct pipe_image_view *views)
-{
-       struct si_context *ctx = (struct si_context *)pipe;
-       unsigned i, slot;
-
-       assert(shader < SI_NUM_SHADERS);
-
-       if (!count)
-               return;
-
-       assert(start_slot + count <= SI_NUM_IMAGES);
-
-       if (views) {
-               for (i = 0, slot = start_slot; i < count; ++i, ++slot)
-                       si_set_shader_image(ctx, shader, slot, &views[i], false);
-       } else {
-               for (i = 0, slot = start_slot; i < count; ++i, ++slot)
-                       si_set_shader_image(ctx, shader, slot, NULL, false);
-       }
-
-       si_update_shader_needs_decompress_mask(ctx, shader);
-}
-
-static void
-si_images_update_needs_color_decompress_mask(struct si_images *images)
+static void si_disable_shader_image(struct si_context *ctx, unsigned shader, unsigned slot)
 {
-       unsigned mask = images->enabled_mask;
+   struct si_images *images = &ctx->images[shader];
+
+   if (images->enabled_mask & (1u << slot)) {
+      struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader);
+      unsigned desc_slot = si_get_image_slot(slot);
+
+      pipe_resource_reference(&images->views[slot].resource, NULL);
+      images->needs_color_decompress_mask &= ~(1 << slot);
+
+      memcpy(descs->list + desc_slot * 8, null_image_descriptor, 8 * 4);
+      images->enabled_mask &= ~(1u << slot);
+      ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
+   }
+}
 
-       while (mask) {
-               int i = u_bit_scan(&mask);
-               struct pipe_resource *res = images->views[i].resource;
+static void si_mark_image_range_valid(const struct pipe_image_view *view)
+{
+   struct si_resource *res = si_resource(view->resource);
 
-               if (res && res->target != PIPE_BUFFER) {
-                       struct si_texture *tex = (struct si_texture *)res;
+   if (res->b.b.target != PIPE_BUFFER)
+      return;
 
-                       if (color_needs_decompression(tex)) {
-                               images->needs_color_decompress_mask |= 1 << i;
-                       } else {
-                               images->needs_color_decompress_mask &= ~(1 << i);
-                       }
-               }
-       }
+   util_range_add(&res->b.b, &res->valid_buffer_range, view->u.buf.offset,
+                  view->u.buf.offset + view->u.buf.size);
+}
+
+static void si_set_shader_image_desc(struct si_context *ctx, const struct pipe_image_view *view,
+                                     bool skip_decompress, uint32_t *desc, uint32_t *fmask_desc)
+{
+   struct si_screen *screen = ctx->screen;
+   struct si_resource *res;
+
+   res = si_resource(view->resource);
+
+   if (res->b.b.target == PIPE_BUFFER || view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) {
+      if (view->access & PIPE_IMAGE_ACCESS_WRITE)
+         si_mark_image_range_valid(view);
+
+      si_make_buffer_descriptor(screen, res, view->format, view->u.buf.offset, view->u.buf.size,
+                                desc);
+      si_set_buf_desc_address(res, view->u.buf.offset, desc + 4);
+   } else {
+      static const unsigned char swizzle[4] = {0, 1, 2, 3};
+      struct si_texture *tex = (struct si_texture *)res;
+      unsigned level = view->u.tex.level;
+      unsigned width, height, depth, hw_level;
+      bool uses_dcc = vi_dcc_enabled(tex, level);
+      unsigned access = view->access;
+
+      assert(!tex->is_depth);
+      assert(fmask_desc || tex->surface.fmask_offset == 0);
+
+      if (uses_dcc && !skip_decompress &&
+          (access & PIPE_IMAGE_ACCESS_WRITE ||
+           !vi_dcc_formats_compatible(screen, res->b.b.format, view->format))) {
+         /* If DCC can't be disabled, at least decompress it.
+          * The decompression is relatively cheap if the surface
+          * has been decompressed already.
+          */
+         if (!si_texture_disable_dcc(ctx, tex))
+            si_decompress_dcc(ctx, tex);
+      }
+
+      if (ctx->chip_class >= GFX9) {
+         /* Always set the base address. The swizzle modes don't
+          * allow setting mipmap level offsets as the base.
+          */
+         width = res->b.b.width0;
+         height = res->b.b.height0;
+         depth = res->b.b.depth0;
+         hw_level = level;
+      } else {
+         /* Always force the base level to the selected level.
+          *
+          * This is required for 3D textures, where otherwise
+          * selecting a single slice for non-layered bindings
+          * fails. It doesn't hurt the other targets.
+          */
+         width = u_minify(res->b.b.width0, level);
+         height = u_minify(res->b.b.height0, level);
+         depth = u_minify(res->b.b.depth0, level);
+         hw_level = 0;
+      }
+
+      screen->make_texture_descriptor(
+         screen, tex, false, res->b.b.target, view->format, swizzle, hw_level, hw_level,
+         view->u.tex.first_layer, view->u.tex.last_layer, width, height, depth, desc, fmask_desc);
+      si_set_mutable_tex_desc_fields(screen, tex, &tex->surface.u.legacy.level[level], level, level,
+                                     util_format_get_blockwidth(view->format), false, desc);
+   }
+}
+
+static void si_set_shader_image(struct si_context *ctx, unsigned shader, unsigned slot,
+                                const struct pipe_image_view *view, bool skip_decompress)
+{
+   struct si_images *images = &ctx->images[shader];
+   struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader);
+   struct si_resource *res;
+
+   if (!view || !view->resource) {
+      si_disable_shader_image(ctx, shader, slot);
+      return;
+   }
+
+   res = si_resource(view->resource);
+
+   if (&images->views[slot] != view)
+      util_copy_image_view(&images->views[slot], view);
+
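+   /* Write the image descriptor into the first 8 dwords of the slot; the matching
+    * FMASK descriptor goes into the slot offset by SI_NUM_IMAGES. */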
+   si_set_shader_image_desc(ctx, view, skip_decompress, descs->list + si_get_image_slot(slot) * 8,
+                            descs->list + si_get_image_slot(slot + SI_NUM_IMAGES) * 8);
+
+   if (res->b.b.target == PIPE_BUFFER || view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) {
+      images->needs_color_decompress_mask &= ~(1 << slot);
+      res->bind_history |= PIPE_BIND_SHADER_IMAGE;
+   } else {
+      struct si_texture *tex = (struct si_texture *)res;
+      unsigned level = view->u.tex.level;
+
+      if (color_needs_decompression(tex)) {
+         images->needs_color_decompress_mask |= 1 << slot;
+      } else {
+         images->needs_color_decompress_mask &= ~(1 << slot);
+      }
+
+      if (vi_dcc_enabled(tex, level) && p_atomic_read(&tex->framebuffers_bound))
+         ctx->need_check_render_feedback = true;
+   }
+
+   images->enabled_mask |= 1u << slot;
+   ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
+
+   /* Since this can flush, it must be done after enabled_mask is updated. */
+   si_sampler_view_add_buffer(
+      ctx, &res->b.b,
+      (view->access & PIPE_IMAGE_ACCESS_WRITE) ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ, false,
+      true);
+}
+
+static void si_set_shader_images(struct pipe_context *pipe, enum pipe_shader_type shader,
+                                 unsigned start_slot, unsigned count,
+                                 const struct pipe_image_view *views)
+{
+   struct si_context *ctx = (struct si_context *)pipe;
+   unsigned i, slot;
+
+   assert(shader < SI_NUM_SHADERS);
+
+   if (!count)
+      return;
+
+   assert(start_slot + count <= SI_NUM_IMAGES);
+
+   if (views) {
+      for (i = 0, slot = start_slot; i < count; ++i, ++slot)
+         si_set_shader_image(ctx, shader, slot, &views[i], false);
+   } else {
+      for (i = 0, slot = start_slot; i < count; ++i, ++slot)
+         si_set_shader_image(ctx, shader, slot, NULL, false);
+   }
+
+   si_update_shader_needs_decompress_mask(ctx, shader);
+}
+
+static void si_images_update_needs_color_decompress_mask(struct si_images *images)
+{
+   unsigned mask = images->enabled_mask;
+
+   while (mask) {
+      int i = u_bit_scan(&mask);
+      struct pipe_resource *res = images->views[i].resource;
+
+      if (res && res->target != PIPE_BUFFER) {
+         struct si_texture *tex = (struct si_texture *)res;
+
+         if (color_needs_decompression(tex)) {
+            images->needs_color_decompress_mask |= 1 << i;
+         } else {
+            images->needs_color_decompress_mask &= ~(1 << i);
+         }
+      }
+   }
 }
 
 void si_update_ps_colorbuf0_slot(struct si_context *sctx)
 {
-       struct si_buffer_resources *buffers = &sctx->rw_buffers;
-       struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS];
-       unsigned slot = SI_PS_IMAGE_COLORBUF0;
-       struct pipe_surface *surf = NULL;
-
-       /* si_texture_disable_dcc can get us here again. */
-       if (sctx->blitter->running)
-               return;
-
-       /* See whether FBFETCH is used and color buffer 0 is set. */
-       if (sctx->ps_shader.cso &&
-           sctx->ps_shader.cso->info.uses_fbfetch &&
-           sctx->framebuffer.state.nr_cbufs &&
-           sctx->framebuffer.state.cbufs[0])
-               surf = sctx->framebuffer.state.cbufs[0];
-
-       /* Return if FBFETCH transitions from disabled to disabled. */
-       if (!buffers->buffers[slot] && !surf)
-               return;
-
-       sctx->ps_uses_fbfetch = surf != NULL;
-       si_update_ps_iter_samples(sctx);
-
-       if (surf) {
-               struct si_texture *tex = (struct si_texture*)surf->texture;
-               struct pipe_image_view view = {0};
-
-               assert(tex);
-               assert(!tex->is_depth);
-
-               /* Disable DCC, because the texture is used as both a sampler
-                * and color buffer.
-                */
-               si_texture_disable_dcc(sctx, tex);
-
-               if (tex->buffer.b.b.nr_samples <= 1 && tex->cmask_buffer) {
-                       /* Disable CMASK. */
-                       assert(tex->cmask_buffer != &tex->buffer);
-                       si_eliminate_fast_color_clear(sctx, tex);
-                       si_texture_discard_cmask(sctx->screen, tex);
-               }
-
-               view.resource = surf->texture;
-               view.format = surf->format;
-               view.access = PIPE_IMAGE_ACCESS_READ;
-               view.u.tex.first_layer = surf->u.tex.first_layer;
-               view.u.tex.last_layer = surf->u.tex.last_layer;
-               view.u.tex.level = surf->u.tex.level;
-
-               /* Set the descriptor. */
-               uint32_t *desc = descs->list + slot*4;
-               memset(desc, 0, 16 * 4);
-               si_set_shader_image_desc(sctx, &view, true, desc, desc + 8);
-
-               pipe_resource_reference(&buffers->buffers[slot], &tex->buffer.b.b);
-               radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-                                         &tex->buffer, RADEON_USAGE_READ,
-                                         RADEON_PRIO_SHADER_RW_IMAGE);
-               buffers->enabled_mask |= 1u << slot;
-       } else {
-               /* Clear the descriptor. */
-               memset(descs->list + slot*4, 0, 8*4);
-               pipe_resource_reference(&buffers->buffers[slot], NULL);
-               buffers->enabled_mask &= ~(1u << slot);
-       }
-
-       sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS;
+   struct si_buffer_resources *buffers = &sctx->rw_buffers;
+   struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS];
+   unsigned slot = SI_PS_IMAGE_COLORBUF0;
+   struct pipe_surface *surf = NULL;
+
+   /* si_texture_disable_dcc can get us here again. */
+   if (sctx->blitter->running)
+      return;
+
+   /* See whether FBFETCH is used and color buffer 0 is set. */
+   if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_fbfetch &&
+       sctx->framebuffer.state.nr_cbufs && sctx->framebuffer.state.cbufs[0])
+      surf = sctx->framebuffer.state.cbufs[0];
+
+   /* Return if FBFETCH transitions from disabled to disabled. */
+   if (!buffers->buffers[slot] && !surf)
+      return;
+
+   sctx->ps_uses_fbfetch = surf != NULL;
+   si_update_ps_iter_samples(sctx);
+
+   if (surf) {
+      struct si_texture *tex = (struct si_texture *)surf->texture;
+      struct pipe_image_view view = {0};
+
+      assert(tex);
+      assert(!tex->is_depth);
+
+      /* Disable DCC, because the texture is used as both a sampler
+       * and color buffer.
+       */
+      si_texture_disable_dcc(sctx, tex);
+
+      if (tex->buffer.b.b.nr_samples <= 1 && tex->cmask_buffer) {
+         /* Disable CMASK. */
+         assert(tex->cmask_buffer != &tex->buffer);
+         si_eliminate_fast_color_clear(sctx, tex);
+         si_texture_discard_cmask(sctx->screen, tex);
+      }
+
+      view.resource = surf->texture;
+      view.format = surf->format;
+      view.access = PIPE_IMAGE_ACCESS_READ;
+      view.u.tex.first_layer = surf->u.tex.first_layer;
+      view.u.tex.last_layer = surf->u.tex.last_layer;
+      view.u.tex.level = surf->u.tex.level;
+
+      /* Set the descriptor. */
+      uint32_t *desc = descs->list + slot * 4;
+      memset(desc, 0, 16 * 4);
+      si_set_shader_image_desc(sctx, &view, true, desc, desc + 8);
+
+      pipe_resource_reference(&buffers->buffers[slot], &tex->buffer.b.b);
+      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READ,
+                                RADEON_PRIO_SHADER_RW_IMAGE);
+      buffers->enabled_mask |= 1u << slot;
+   } else {
+      /* Clear the descriptor. */
+      memset(descs->list + slot * 4, 0, 8 * 4);
+      pipe_resource_reference(&buffers->buffers[slot], NULL);
+      buffers->enabled_mask &= ~(1u << slot);
+   }
+
+   sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS;
 }
 
 /* SAMPLER STATES */
 
-static void si_bind_sampler_states(struct pipe_context *ctx,
-                                   enum pipe_shader_type shader,
+static void si_bind_sampler_states(struct pipe_context *ctx, enum pipe_shader_type shader,
                                    unsigned start, unsigned count, void **states)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_samplers *samplers = &sctx->samplers[shader];
-       struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, shader);
-       struct si_sampler_state **sstates = (struct si_sampler_state**)states;
-       int i;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_samplers *samplers = &sctx->samplers[shader];
+   struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, shader);
+   struct si_sampler_state **sstates = (struct si_sampler_state **)states;
+   int i;
 
-       if (!count || shader >= SI_NUM_SHADERS || !sstates)
-               return;
+   if (!count || shader >= SI_NUM_SHADERS || !sstates)
+      return;
 
-       for (i = 0; i < count; i++) {
-               unsigned slot = start + i;
-               unsigned desc_slot = si_get_sampler_slot(slot);
+   for (i = 0; i < count; i++) {
+      unsigned slot = start + i;
+      unsigned desc_slot = si_get_sampler_slot(slot);
 
-               if (!sstates[i] ||
-                   sstates[i] == samplers->sampler_states[slot])
-                       continue;
+      if (!sstates[i] || sstates[i] == samplers->sampler_states[slot])
+         continue;
 
 #ifndef NDEBUG
-               assert(sstates[i]->magic == SI_SAMPLER_STATE_MAGIC);
+      assert(sstates[i]->magic == SI_SAMPLER_STATE_MAGIC);
 #endif
-               samplers->sampler_states[slot] = sstates[i];
+      samplers->sampler_states[slot] = sstates[i];
 
-               /* If FMASK is bound, don't overwrite it.
-                * The sampler state will be set after FMASK is unbound.
-                */
-               struct si_sampler_view *sview =
-                       (struct si_sampler_view *)samplers->views[slot];
+      /* If FMASK is bound, don't overwrite it.
+       * The sampler state will be set after FMASK is unbound.
+       */
+      struct si_sampler_view *sview = (struct si_sampler_view *)samplers->views[slot];
 
-               struct si_texture *tex = NULL;
+      struct si_texture *tex = NULL;
 
-               if (sview && sview->base.texture &&
-                   sview->base.texture->target != PIPE_BUFFER)
-                       tex = (struct si_texture *)sview->base.texture;
+      if (sview && sview->base.texture && sview->base.texture->target != PIPE_BUFFER)
+         tex = (struct si_texture *)sview->base.texture;
 
-               if (tex && tex->surface.fmask_size)
-                       continue;
+      if (tex && tex->surface.fmask_size)
+         continue;
 
-               si_set_sampler_state_desc(sstates[i], sview, tex,
-                                         desc->list + desc_slot * 16 + 12);
+      si_set_sampler_state_desc(sstates[i], sview, tex, desc->list + desc_slot * 16 + 12);
 
-               sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
-       }
+      sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
+   }
 }
 
 /* BUFFER RESOURCES */
 
 static void si_init_buffer_resources(struct si_buffer_resources *buffers,
-                                    struct si_descriptors *descs,
-                                    unsigned num_buffers,
-                                    short shader_userdata_rel_index,
-                                    enum radeon_bo_priority priority,
-                                    enum radeon_bo_priority priority_constbuf)
+                                     struct si_descriptors *descs, unsigned num_buffers,
+                                     short shader_userdata_rel_index,
+                                     enum radeon_bo_priority priority,
+                                     enum radeon_bo_priority priority_constbuf)
 {
-       buffers->priority = priority;
-       buffers->priority_constbuf = priority_constbuf;
-       buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
-       buffers->offsets = CALLOC(num_buffers, sizeof(buffers->offsets[0]));
+   buffers->priority = priority;
+   buffers->priority_constbuf = priority_constbuf;
+   buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource *));
+   buffers->offsets = CALLOC(num_buffers, sizeof(buffers->offsets[0]));
 
-       si_init_descriptors(descs, shader_userdata_rel_index, 4, num_buffers);
+   si_init_descriptors(descs, shader_userdata_rel_index, 4, num_buffers);
 }
 
 static void si_release_buffer_resources(struct si_buffer_resources *buffers,
-                                       struct si_descriptors *descs)
+                                        struct si_descriptors *descs)
 {
-       int i;
+   int i;
 
-       for (i = 0; i < descs->num_elements; i++) {
-               pipe_resource_reference(&buffers->buffers[i], NULL);
-       }
+   for (i = 0; i < descs->num_elements; i++) {
+      pipe_resource_reference(&buffers->buffers[i], NULL);
+   }
 
-       FREE(buffers->buffers);
-       FREE(buffers->offsets);
+   FREE(buffers->buffers);
+   FREE(buffers->offsets);
 }
 
 static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
-                                            struct si_buffer_resources *buffers)
+                                             struct si_buffer_resources *buffers)
 {
-       unsigned mask = buffers->enabled_mask;
+   unsigned mask = buffers->enabled_mask;
 
-       /* Add buffers to the CS. */
-       while (mask) {
-               int i = u_bit_scan(&mask);
+   /* Add buffers to the CS. */
+   while (mask) {
+      int i = u_bit_scan(&mask);
 
-               radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-                       si_resource(buffers->buffers[i]),
-                       buffers->writable_mask & (1u << i) ? RADEON_USAGE_READWRITE :
-                                                            RADEON_USAGE_READ,
-                       i < SI_NUM_SHADER_BUFFERS ? buffers->priority :
-                                                   buffers->priority_constbuf);
-       }
+      radeon_add_to_buffer_list(
+         sctx, sctx->gfx_cs, si_resource(buffers->buffers[i]),
+         buffers->writable_mask & (1u << i) ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ,
+         i < SI_NUM_SHADER_BUFFERS ? buffers->priority : buffers->priority_constbuf);
+   }
 }
 
 static void si_get_buffer_from_descriptors(struct si_buffer_resources *buffers,
-                                          struct si_descriptors *descs,
-                                          unsigned idx, struct pipe_resource **buf,
-                                          unsigned *offset, unsigned *size)
+                                           struct si_descriptors *descs, unsigned idx,
+                                           struct pipe_resource **buf, unsigned *offset,
+                                           unsigned *size)
 {
-       pipe_resource_reference(buf, buffers->buffers[idx]);
-       if (*buf) {
-               struct si_resource *res = si_resource(*buf);
-               const uint32_t *desc = descs->list + idx * 4;
-               uint64_t va;
+   pipe_resource_reference(buf, buffers->buffers[idx]);
+   if (*buf) {
+      struct si_resource *res = si_resource(*buf);
+      const uint32_t *desc = descs->list + idx * 4;
+      uint64_t va;
 
-               *size = desc[2];
+      *size = desc[2];
 
-               assert(G_008F04_STRIDE(desc[1]) == 0);
-               va = si_desc_extract_buffer_address(desc);
+      assert(G_008F04_STRIDE(desc[1]) == 0);
+      va = si_desc_extract_buffer_address(desc);
 
-               assert(va >= res->gpu_address && va + *size <= res->gpu_address + res->bo_size);
-               *offset = va - res->gpu_address;
-       }
+      assert(va >= res->gpu_address && va + *size <= res->gpu_address + res->bo_size);
+      *offset = va - res->gpu_address;
+   }
 }
 
 /* VERTEX BUFFERS */
 
 static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
 {
-       int count = sctx->num_vertex_elements;
-       int i;
+   int count = sctx->num_vertex_elements;
+   int i;
 
-       for (i = 0; i < count; i++) {
-               int vb = sctx->vertex_elements->vertex_buffer_index[i];
+   for (i = 0; i < count; i++) {
+      int vb = sctx->vertex_elements->vertex_buffer_index[i];
 
-               if (vb >= ARRAY_SIZE(sctx->vertex_buffer))
-                       continue;
-               if (!sctx->vertex_buffer[vb].buffer.resource)
-                       continue;
+      if (vb >= ARRAY_SIZE(sctx->vertex_buffer))
+         continue;
+      if (!sctx->vertex_buffer[vb].buffer.resource)
+         continue;
 
-               radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-                                     si_resource(sctx->vertex_buffer[vb].buffer.resource),
-                                     RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
-       }
+      radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
+                                si_resource(sctx->vertex_buffer[vb].buffer.resource),
+                                RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
+   }
 
-       if (!sctx->vb_descriptors_buffer)
-               return;
-       radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-                                 sctx->vb_descriptors_buffer, RADEON_USAGE_READ,
-                                 RADEON_PRIO_DESCRIPTORS);
+   if (!sctx->vb_descriptors_buffer)
+      return;
+   radeon_add_to_buffer_list(sctx, sctx->gfx_cs, sctx->vb_descriptors_buffer, RADEON_USAGE_READ,
+                             RADEON_PRIO_DESCRIPTORS);
 }
 
 bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 {
-       unsigned i, count = sctx->num_vertex_elements;
-       uint32_t *ptr;
-
-       if (!sctx->vertex_buffers_dirty || !count)
-               return true;
-
-       struct si_vertex_elements *velems = sctx->vertex_elements;
-       unsigned alloc_size = velems->vb_desc_list_alloc_size;
-
-       if (alloc_size) {
-               /* Vertex buffer descriptors are the only ones which are uploaded
-                * directly through a staging buffer and don't go through
-                * the fine-grained upload path.
-                */
-               u_upload_alloc(sctx->b.const_uploader, 0,
-                              alloc_size,
-                              si_optimal_tcc_alignment(sctx, alloc_size),
-                              &sctx->vb_descriptors_offset,
-                              (struct pipe_resource**)&sctx->vb_descriptors_buffer,
-                              (void**)&ptr);
-               if (!sctx->vb_descriptors_buffer) {
-                       sctx->vb_descriptors_offset = 0;
-                       sctx->vb_descriptors_gpu_list = NULL;
-                       return false;
-               }
-
-               sctx->vb_descriptors_gpu_list = ptr;
-               radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-                                         sctx->vb_descriptors_buffer, RADEON_USAGE_READ,
-                                         RADEON_PRIO_DESCRIPTORS);
-               sctx->vertex_buffer_pointer_dirty = true;
-               sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
-       } else {
-               si_resource_reference(&sctx->vb_descriptors_buffer, NULL);
-               sctx->vertex_buffer_pointer_dirty = false;
-               sctx->prefetch_L2_mask &= ~SI_PREFETCH_VBO_DESCRIPTORS;
-       }
-
-       assert(count <= SI_MAX_ATTRIBS);
-
-       unsigned first_vb_use_mask = velems->first_vb_use_mask;
-       unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs;
-
-       for (i = 0; i < count; i++) {
-               struct pipe_vertex_buffer *vb;
-               struct si_resource *buf;
-               unsigned vbo_index = velems->vertex_buffer_index[i];
-               uint32_t *desc = i < num_vbos_in_user_sgprs ?
-                                       &sctx->vb_descriptor_user_sgprs[i * 4] :
-                                       &ptr[(i - num_vbos_in_user_sgprs) * 4];
-
-               vb = &sctx->vertex_buffer[vbo_index];
-               buf = si_resource(vb->buffer.resource);
-               if (!buf) {
-                       memset(desc, 0, 16);
-                       continue;
-               }
-
-               int64_t offset = (int64_t)((int)vb->buffer_offset) +
-                                velems->src_offset[i];
-
-               if (offset >= buf->b.b.width0) {
-                       assert(offset < buf->b.b.width0);
-                       memset(desc, 0, 16);
-                       continue;
-               }
-
-               uint64_t va = buf->gpu_address + offset;
-
-               int64_t num_records = (int64_t)buf->b.b.width0 - offset;
-               if (sctx->chip_class != GFX8 && vb->stride) {
-                       /* Round up by rounding down and adding 1 */
-                       num_records = (num_records - velems->format_size[i]) /
-                                     vb->stride + 1;
-               }
-               assert(num_records >= 0 && num_records <= UINT_MAX);
-
-               uint32_t rsrc_word3 = velems->rsrc_word3[i];
-
-               /* OOB_SELECT chooses the out-of-bounds check:
-                *  - 1: index >= NUM_RECORDS (Structured)
-                *  - 3: offset >= NUM_RECORDS (Raw)
-                */
-               if (sctx->chip_class >= GFX10)
-                       rsrc_word3 |= S_008F0C_OOB_SELECT(vb->stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW);
-
-               desc[0] = va;
-               desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
-                         S_008F04_STRIDE(vb->stride);
-               desc[2] = num_records;
-               desc[3] = rsrc_word3;
-
-               if (first_vb_use_mask & (1 << i)) {
-                       radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-                                             si_resource(vb->buffer.resource),
-                                             RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
-               }
-       }
-
-       /* Don't flush the const cache. It would have a very negative effect
-        * on performance (confirmed by testing). New descriptors are always
-        * uploaded to a fresh new buffer, so I don't think flushing the const
-        * cache is needed. */
-       si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
-       sctx->vertex_buffer_user_sgprs_dirty = num_vbos_in_user_sgprs > 0;
-       sctx->vertex_buffers_dirty = false;
-       return true;
+   unsigned i, count = sctx->num_vertex_elements;
+   uint32_t *ptr;
+
+   if (!sctx->vertex_buffers_dirty || !count)
+      return true;
+
+   struct si_vertex_elements *velems = sctx->vertex_elements;
+   unsigned alloc_size = velems->vb_desc_list_alloc_size;
+
+   if (alloc_size) {
+      /* Vertex buffer descriptors are the only ones which are uploaded
+       * directly through a staging buffer and don't go through
+       * the fine-grained upload path.
+       */
+      u_upload_alloc(sctx->b.const_uploader, 0, alloc_size,
+                     si_optimal_tcc_alignment(sctx, alloc_size), &sctx->vb_descriptors_offset,
+                     (struct pipe_resource **)&sctx->vb_descriptors_buffer, (void **)&ptr);
+      if (!sctx->vb_descriptors_buffer) {
+         sctx->vb_descriptors_offset = 0;
+         sctx->vb_descriptors_gpu_list = NULL;
+         return false;
+      }
+
+      sctx->vb_descriptors_gpu_list = ptr;
+      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, sctx->vb_descriptors_buffer, RADEON_USAGE_READ,
+                                RADEON_PRIO_DESCRIPTORS);
+      sctx->vertex_buffer_pointer_dirty = true;
+      sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
+   } else {
+      si_resource_reference(&sctx->vb_descriptors_buffer, NULL);
+      sctx->vertex_buffer_pointer_dirty = false;
+      sctx->prefetch_L2_mask &= ~SI_PREFETCH_VBO_DESCRIPTORS;
+   }
+
+   assert(count <= SI_MAX_ATTRIBS);
+
+   unsigned first_vb_use_mask = velems->first_vb_use_mask;
+   unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs;
+
+   for (i = 0; i < count; i++) {
+      struct pipe_vertex_buffer *vb;
+      struct si_resource *buf;
+      unsigned vbo_index = velems->vertex_buffer_index[i];
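+      /* Descriptors for the first num_vbos_in_user_sgprs vertex buffers are written
+       * to vb_descriptor_user_sgprs; the rest go into the uploaded list. */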
+      uint32_t *desc = i < num_vbos_in_user_sgprs ? &sctx->vb_descriptor_user_sgprs[i * 4]
+                                                  : &ptr[(i - num_vbos_in_user_sgprs) * 4];
+
+      vb = &sctx->vertex_buffer[vbo_index];
+      buf = si_resource(vb->buffer.resource);
+      if (!buf) {
+         memset(desc, 0, 16);
+         continue;
+      }
+
+      int64_t offset = (int64_t)((int)vb->buffer_offset) + velems->src_offset[i];
+
+      if (offset >= buf->b.b.width0) {
+         assert(offset < buf->b.b.width0);
+         memset(desc, 0, 16);
+         continue;
+      }
+
+      uint64_t va = buf->gpu_address + offset;
+
+      int64_t num_records = (int64_t)buf->b.b.width0 - offset;
+      if (sctx->chip_class != GFX8 && vb->stride) {
+         /* Round up by rounding down and adding 1 */
+         num_records = (num_records - velems->format_size[i]) / vb->stride + 1;
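+         /* e.g. with width0 - offset = 30, format_size = 8 and stride = 12:
+          * (30 - 8) / 12 + 1 = 2 records are fully addressable. */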
+      }
+      assert(num_records >= 0 && num_records <= UINT_MAX);
+
+      uint32_t rsrc_word3 = velems->rsrc_word3[i];
+
+      /* OOB_SELECT chooses the out-of-bounds check:
+       *  - 1: index >= NUM_RECORDS (Structured)
+       *  - 3: offset >= NUM_RECORDS (Raw)
+       */
+      if (sctx->chip_class >= GFX10)
+         rsrc_word3 |= S_008F0C_OOB_SELECT(vb->stride ? V_008F0C_OOB_SELECT_STRUCTURED
+                                                      : V_008F0C_OOB_SELECT_RAW);
+
+      desc[0] = va;
+      desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(vb->stride);
+      desc[2] = num_records;
+      desc[3] = rsrc_word3;
+
+      if (first_vb_use_mask & (1 << i)) {
+         radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(vb->buffer.resource),
+                                   RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
+      }
+   }
+
+   /* Don't flush the const cache. It would have a very negative effect
+    * on performance (confirmed by testing). New descriptors are always
+    * uploaded to a fresh new buffer, so I don't think flushing the const
+    * cache is needed. */
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
+   sctx->vertex_buffer_user_sgprs_dirty = num_vbos_in_user_sgprs > 0;
+   sctx->vertex_buffers_dirty = false;
+   return true;
 }
 
-
 /* CONSTANT BUFFERS */
 
-static struct si_descriptors *
-si_const_and_shader_buffer_descriptors(struct si_context *sctx, unsigned shader)
-{
-       return &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(shader)];
-}
-
-void si_upload_const_buffer(struct si_context *sctx, struct si_resource **buf,
-                           const uint8_t *ptr, unsigned size, uint32_t *const_offset)
-{
-       void *tmp;
-
-       u_upload_alloc(sctx->b.const_uploader, 0, size,
-                      si_optimal_tcc_alignment(sctx, size),
-                      const_offset,
-                      (struct pipe_resource**)buf, &tmp);
-       if (*buf)
-               util_memcpy_cpu_to_le32(tmp, ptr, size);
-}
-
-static void si_set_constant_buffer(struct si_context *sctx,
-                                  struct si_buffer_resources *buffers,
-                                  unsigned descriptors_idx,
-                                  uint slot, const struct pipe_constant_buffer *input)
-{
-       struct si_descriptors *descs = &sctx->descriptors[descriptors_idx];
-       assert(slot < descs->num_elements);
-       pipe_resource_reference(&buffers->buffers[slot], NULL);
-
-       /* GFX7 cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
-        * with a NULL buffer). We need to use a dummy buffer instead. */
-       if (sctx->chip_class == GFX7 &&
-           (!input || (!input->buffer && !input->user_buffer)))
-               input = &sctx->null_const_buf;
-
-       if (input && (input->buffer || input->user_buffer)) {
-               struct pipe_resource *buffer = NULL;
-               uint64_t va;
-               unsigned buffer_offset;
-
-               /* Upload the user buffer if needed. */
-               if (input->user_buffer) {
-                       si_upload_const_buffer(sctx,
-                                              (struct si_resource**)&buffer, input->user_buffer,
-                                              input->buffer_size, &buffer_offset);
-                       if (!buffer) {
-                               /* Just unbind on failure. */
-                               si_set_constant_buffer(sctx, buffers, descriptors_idx, slot, NULL);
-                               return;
-                       }
-               } else {
-                       pipe_resource_reference(&buffer, input->buffer);
-                       buffer_offset = input->buffer_offset;
-               }
-
-               va = si_resource(buffer)->gpu_address + buffer_offset;
-
-               /* Set the descriptor. */
-               uint32_t *desc = descs->list + slot*4;
-               desc[0] = va;
-               desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
-                         S_008F04_STRIDE(0);
-               desc[2] = input->buffer_size;
-               desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
-                         S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-                         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
-                         S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
-
-               if (sctx->chip_class >= GFX10) {
-                       desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
-                                  S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
-                                  S_008F0C_RESOURCE_LEVEL(1);
-               } else {
-                       desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
-                                  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
-               }
-
-               buffers->buffers[slot] = buffer;
-               buffers->offsets[slot] = buffer_offset;
-               radeon_add_to_gfx_buffer_list_check_mem(sctx,
-                                                       si_resource(buffer),
-                                                       RADEON_USAGE_READ,
-                                                       buffers->priority_constbuf, true);
-               buffers->enabled_mask |= 1u << slot;
-       } else {
-               /* Clear the descriptor. */
-               memset(descs->list + slot*4, 0, sizeof(uint32_t) * 4);
-               buffers->enabled_mask &= ~(1u << slot);
-       }
-
-       sctx->descriptors_dirty |= 1u << descriptors_idx;
-}
-
-static void si_pipe_set_constant_buffer(struct pipe_context *ctx,
-                                       enum pipe_shader_type shader, uint slot,
-                                       const struct pipe_constant_buffer *input)
-{
-       struct si_context *sctx = (struct si_context *)ctx;
-
-       if (shader >= SI_NUM_SHADERS)
-               return;
-
-       if (slot == 0 && input && input->buffer &&
-           !(si_resource(input->buffer)->flags & RADEON_FLAG_32BIT)) {
-               assert(!"constant buffer 0 must have a 32-bit VM address, use const_uploader");
-               return;
-       }
-
-       if (input && input->buffer)
-               si_resource(input->buffer)->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
-
-       slot = si_get_constbuf_slot(slot);
-       si_set_constant_buffer(sctx, &sctx->const_and_shader_buffers[shader],
-                              si_const_and_shader_buffer_descriptors_idx(shader),
-                              slot, input);
-}
-
-void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader,
-                                uint slot, struct pipe_constant_buffer *cbuf)
-{
-       cbuf->user_buffer = NULL;
-       si_get_buffer_from_descriptors(
-               &sctx->const_and_shader_buffers[shader],
-               si_const_and_shader_buffer_descriptors(sctx, shader),
-               si_get_constbuf_slot(slot),
-               &cbuf->buffer, &cbuf->buffer_offset, &cbuf->buffer_size);
+static struct si_descriptors *si_const_and_shader_buffer_descriptors(struct si_context *sctx,
+                                                                     unsigned shader)
+{
+   return &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(shader)];
+}
+
+void si_upload_const_buffer(struct si_context *sctx, struct si_resource **buf, const uint8_t *ptr,
+                            unsigned size, uint32_t *const_offset)
+{
+   void *tmp;
+
+   u_upload_alloc(sctx->b.const_uploader, 0, size, si_optimal_tcc_alignment(sctx, size),
+                  const_offset, (struct pipe_resource **)buf, &tmp);
+   if (*buf)
+      util_memcpy_cpu_to_le32(tmp, ptr, size);
+}
+
+static void si_set_constant_buffer(struct si_context *sctx, struct si_buffer_resources *buffers,
+                                   unsigned descriptors_idx, uint slot,
+                                   const struct pipe_constant_buffer *input)
+{
+   struct si_descriptors *descs = &sctx->descriptors[descriptors_idx];
+   assert(slot < descs->num_elements);
+   pipe_resource_reference(&buffers->buffers[slot], NULL);
+
+   /* GFX7 cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
+    * with a NULL buffer). We need to use a dummy buffer instead. */
+   if (sctx->chip_class == GFX7 && (!input || (!input->buffer && !input->user_buffer)))
+      input = &sctx->null_const_buf;
+
+   if (input && (input->buffer || input->user_buffer)) {
+      struct pipe_resource *buffer = NULL;
+      uint64_t va;
+      unsigned buffer_offset;
+
+      /* Upload the user buffer if needed. */
+      if (input->user_buffer) {
+         si_upload_const_buffer(sctx, (struct si_resource **)&buffer, input->user_buffer,
+                                input->buffer_size, &buffer_offset);
+         if (!buffer) {
+            /* Just unbind on failure. */
+            si_set_constant_buffer(sctx, buffers, descriptors_idx, slot, NULL);
+            return;
+         }
+      } else {
+         pipe_resource_reference(&buffer, input->buffer);
+         buffer_offset = input->buffer_offset;
+      }
+
+      va = si_resource(buffer)->gpu_address + buffer_offset;
+
+      /* Set the descriptor. */
+      uint32_t *desc = descs->list + slot * 4;
+      desc[0] = va;
+      desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(0);
+      desc[2] = input->buffer_size;
+      desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+                S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
+
+      if (sctx->chip_class >= GFX10) {
+         desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
+                    S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
+      } else {
+         desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+      }
+
+      buffers->buffers[slot] = buffer;
+      buffers->offsets[slot] = buffer_offset;
+      radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer), RADEON_USAGE_READ,
+                                              buffers->priority_constbuf, true);
+      buffers->enabled_mask |= 1u << slot;
+   } else {
+      /* Clear the descriptor. */
+      memset(descs->list + slot * 4, 0, sizeof(uint32_t) * 4);
+      buffers->enabled_mask &= ~(1u << slot);
+   }
+
+   sctx->descriptors_dirty |= 1u << descriptors_idx;
+}
+
+static void si_pipe_set_constant_buffer(struct pipe_context *ctx, enum pipe_shader_type shader,
+                                        uint slot, const struct pipe_constant_buffer *input)
+{
+   struct si_context *sctx = (struct si_context *)ctx;
+
+   if (shader >= SI_NUM_SHADERS)
+      return;
+
+   if (slot == 0 && input && input->buffer &&
+       !(si_resource(input->buffer)->flags & RADEON_FLAG_32BIT)) {
+      assert(!"constant buffer 0 must have a 32-bit VM address, use const_uploader");
+      return;
+   }
+
+   if (input && input->buffer)
+      si_resource(input->buffer)->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
+
+   slot = si_get_constbuf_slot(slot);
+   si_set_constant_buffer(sctx, &sctx->const_and_shader_buffers[shader],
+                          si_const_and_shader_buffer_descriptors_idx(shader), slot, input);
+}
+
+void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, uint slot,
+                                 struct pipe_constant_buffer *cbuf)
+{
+   cbuf->user_buffer = NULL;
+   si_get_buffer_from_descriptors(
+      &sctx->const_and_shader_buffers[shader], si_const_and_shader_buffer_descriptors(sctx, shader),
+      si_get_constbuf_slot(slot), &cbuf->buffer, &cbuf->buffer_offset, &cbuf->buffer_size);
 }
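
(Aside, not part of the patch: a minimal sketch of how a caller reaches si_pipe_set_constant_buffer() above through the gallium interface. The stage, slot and data values are hypothetical; user_buffer contents go through si_upload_const_buffer() before the descriptor is written, the same pattern si_set_polygon_stipple() uses further down.)

#include "pipe/p_context.h"
#include "pipe/p_defines.h"
#include "pipe/p_state.h"

static void bind_example_constants(struct pipe_context *ctx)
{
   /* Hypothetical CPU-side constants. */
   static const float color[4] = {1.0f, 0.0f, 0.0f, 1.0f};
   struct pipe_constant_buffer cb = {};

   cb.user_buffer = color;
   cb.buffer_size = sizeof(color);

   /* Arbitrary fragment-shader slot; the driver uploads and binds it. */
   ctx->set_constant_buffer(ctx, PIPE_SHADER_FRAGMENT, 1, &cb);
}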
 
 /* SHADER BUFFERS */
 
-static void si_set_shader_buffer(struct si_context *sctx,
-                                struct si_buffer_resources *buffers,
-                                unsigned descriptors_idx,
-                                uint slot, const struct pipe_shader_buffer *sbuffer,
-                                bool writable, enum radeon_bo_priority priority)
-{
-       struct si_descriptors *descs = &sctx->descriptors[descriptors_idx];
-       uint32_t *desc = descs->list + slot * 4;
-
-       if (!sbuffer || !sbuffer->buffer) {
-               pipe_resource_reference(&buffers->buffers[slot], NULL);
-               memset(desc, 0, sizeof(uint32_t) * 4);
-               buffers->enabled_mask &= ~(1u << slot);
-               buffers->writable_mask &= ~(1u << slot);
-               sctx->descriptors_dirty |= 1u << descriptors_idx;
-               return;
-       }
-
-       struct si_resource *buf = si_resource(sbuffer->buffer);
-       uint64_t va = buf->gpu_address + sbuffer->buffer_offset;
-
-       desc[0] = va;
-       desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
-                 S_008F04_STRIDE(0);
-       desc[2] = sbuffer->buffer_size;
-       desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
-                 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-                 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
-                 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
-
-       if (sctx->chip_class >= GFX10) {
-               desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
-                          S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
-                          S_008F0C_RESOURCE_LEVEL(1);
-       } else {
-               desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
-                          S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
-       }
-
-       pipe_resource_reference(&buffers->buffers[slot], &buf->b.b);
-       buffers->offsets[slot] = sbuffer->buffer_offset;
-       radeon_add_to_gfx_buffer_list_check_mem(sctx, buf,
-                                               writable ? RADEON_USAGE_READWRITE :
-                                                          RADEON_USAGE_READ,
-                                               priority, true);
-       if (writable)
-               buffers->writable_mask |= 1u << slot;
-       else
-               buffers->writable_mask &= ~(1u << slot);
-
-       buffers->enabled_mask |= 1u << slot;
-       sctx->descriptors_dirty |= 1u << descriptors_idx;
-
-       util_range_add(&buf->b.b, &buf->valid_buffer_range, sbuffer->buffer_offset,
-                      sbuffer->buffer_offset + sbuffer->buffer_size);
-}
-
-static void si_set_shader_buffers(struct pipe_context *ctx,
-                                 enum pipe_shader_type shader,
-                                 unsigned start_slot, unsigned count,
-                                 const struct pipe_shader_buffer *sbuffers,
-                                 unsigned writable_bitmask)
-{
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader];
-       unsigned descriptors_idx = si_const_and_shader_buffer_descriptors_idx(shader);
-       unsigned i;
-
-       assert(start_slot + count <= SI_NUM_SHADER_BUFFERS);
-
-       for (i = 0; i < count; ++i) {
-               const struct pipe_shader_buffer *sbuffer = sbuffers ? &sbuffers[i] : NULL;
-               unsigned slot = si_get_shaderbuf_slot(start_slot + i);
-
-               if (sbuffer && sbuffer->buffer)
-                       si_resource(sbuffer->buffer)->bind_history |= PIPE_BIND_SHADER_BUFFER;
-
-               si_set_shader_buffer(sctx, buffers, descriptors_idx, slot, sbuffer,
-                                    !!(writable_bitmask & (1u << i)),
-                                    buffers->priority);
-       }
-}
-
-void si_get_shader_buffers(struct si_context *sctx,
-                          enum pipe_shader_type shader,
-                          uint start_slot, uint count,
-                          struct pipe_shader_buffer *sbuf)
-{
-       struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader];
-       struct si_descriptors *descs = si_const_and_shader_buffer_descriptors(sctx, shader);
-
-       for (unsigned i = 0; i < count; ++i) {
-               si_get_buffer_from_descriptors(
-                       buffers, descs,
-                       si_get_shaderbuf_slot(start_slot + i),
-                       &sbuf[i].buffer, &sbuf[i].buffer_offset,
-                       &sbuf[i].buffer_size);
-       }
+static void si_set_shader_buffer(struct si_context *sctx, struct si_buffer_resources *buffers,
+                                 unsigned descriptors_idx, uint slot,
+                                 const struct pipe_shader_buffer *sbuffer, bool writable,
+                                 enum radeon_bo_priority priority)
+{
+   struct si_descriptors *descs = &sctx->descriptors[descriptors_idx];
+   uint32_t *desc = descs->list + slot * 4;
+
+   if (!sbuffer || !sbuffer->buffer) {
+      pipe_resource_reference(&buffers->buffers[slot], NULL);
+      memset(desc, 0, sizeof(uint32_t) * 4);
+      buffers->enabled_mask &= ~(1u << slot);
+      buffers->writable_mask &= ~(1u << slot);
+      sctx->descriptors_dirty |= 1u << descriptors_idx;
+      return;
+   }
+
+   struct si_resource *buf = si_resource(sbuffer->buffer);
+   uint64_t va = buf->gpu_address + sbuffer->buffer_offset;
+
+   desc[0] = va;
+   desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(0);
+   desc[2] = sbuffer->buffer_size;
+   desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+             S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
+
+   if (sctx->chip_class >= GFX10) {
+      desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
+                 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
+   } else {
+      desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+                 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+   }
+
+   pipe_resource_reference(&buffers->buffers[slot], &buf->b.b);
+   buffers->offsets[slot] = sbuffer->buffer_offset;
+   radeon_add_to_gfx_buffer_list_check_mem(
+      sctx, buf, writable ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ, priority, true);
+   if (writable)
+      buffers->writable_mask |= 1u << slot;
+   else
+      buffers->writable_mask &= ~(1u << slot);
+
+   buffers->enabled_mask |= 1u << slot;
+   sctx->descriptors_dirty |= 1u << descriptors_idx;
+
+   util_range_add(&buf->b.b, &buf->valid_buffer_range, sbuffer->buffer_offset,
+                  sbuffer->buffer_offset + sbuffer->buffer_size);
+}
+
+static void si_set_shader_buffers(struct pipe_context *ctx, enum pipe_shader_type shader,
+                                  unsigned start_slot, unsigned count,
+                                  const struct pipe_shader_buffer *sbuffers,
+                                  unsigned writable_bitmask)
+{
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader];
+   unsigned descriptors_idx = si_const_and_shader_buffer_descriptors_idx(shader);
+   unsigned i;
+
+   assert(start_slot + count <= SI_NUM_SHADER_BUFFERS);
+
+   for (i = 0; i < count; ++i) {
+      const struct pipe_shader_buffer *sbuffer = sbuffers ? &sbuffers[i] : NULL;
+      unsigned slot = si_get_shaderbuf_slot(start_slot + i);
+
+      if (sbuffer && sbuffer->buffer)
+         si_resource(sbuffer->buffer)->bind_history |= PIPE_BIND_SHADER_BUFFER;
+
+      si_set_shader_buffer(sctx, buffers, descriptors_idx, slot, sbuffer,
+                           !!(writable_bitmask & (1u << i)), buffers->priority);
+   }
+}
+
+void si_get_shader_buffers(struct si_context *sctx, enum pipe_shader_type shader, uint start_slot,
+                           uint count, struct pipe_shader_buffer *sbuf)
+{
+   struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader];
+   struct si_descriptors *descs = si_const_and_shader_buffer_descriptors(sctx, shader);
+
+   for (unsigned i = 0; i < count; ++i) {
+      si_get_buffer_from_descriptors(buffers, descs, si_get_shaderbuf_slot(start_slot + i),
+                                     &sbuf[i].buffer, &sbuf[i].buffer_offset, &sbuf[i].buffer_size);
+   }
 }
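
(Aside: the four descriptor dwords written by si_set_constant_buffer() and si_set_shader_buffer() above follow the same raw-buffer layout. A hedged sketch of the address/size part only; the real code builds dwords 1 and 3 with the S_008F04_*/S_008F0C_* macros and picks the format bits per chip generation.)

#include <stdint.h>

static void sketch_buffer_desc(uint32_t desc[4], uint64_t va, uint32_t size_bytes)
{
   desc[0] = (uint32_t)va;         /* BASE_ADDRESS, low 32 bits */
   desc[1] = (uint32_t)(va >> 32); /* BASE_ADDRESS_HI (masked via S_008F04_* above); STRIDE = 0 */
   desc[2] = size_bytes;           /* NUM_RECORDS counts bytes when STRIDE is 0 */
   desc[3] = 0;                    /* DST_SEL_* swizzle + format bits, chip-dependent (see above) */
}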
 
 /* RING BUFFERS */
 
-void si_set_rw_buffer(struct si_context *sctx,
-                     uint slot, const struct pipe_constant_buffer *input)
+void si_set_rw_buffer(struct si_context *sctx, uint slot, const struct pipe_constant_buffer *input)
 {
-       si_set_constant_buffer(sctx, &sctx->rw_buffers, SI_DESCS_RW_BUFFERS,
-                              slot, input);
+   si_set_constant_buffer(sctx, &sctx->rw_buffers, SI_DESCS_RW_BUFFERS, slot, input);
 }
 
 void si_set_rw_shader_buffer(struct si_context *sctx, uint slot,
-                            const struct pipe_shader_buffer *sbuffer)
-{
-       si_set_shader_buffer(sctx, &sctx->rw_buffers, SI_DESCS_RW_BUFFERS,
-                            slot, sbuffer, true, RADEON_PRIO_SHADER_RW_BUFFER);
-}
-
-void si_set_ring_buffer(struct si_context *sctx, uint slot,
-                       struct pipe_resource *buffer,
-                       unsigned stride, unsigned num_records,
-                       bool add_tid, bool swizzle,
-                       unsigned element_size, unsigned index_stride, uint64_t offset)
-{
-       struct si_buffer_resources *buffers = &sctx->rw_buffers;
-       struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS];
-
-       /* The stride field in the resource descriptor has 14 bits */
-       assert(stride < (1 << 14));
-
-       assert(slot < descs->num_elements);
-       pipe_resource_reference(&buffers->buffers[slot], NULL);
-
-       if (buffer) {
-               uint64_t va;
-
-               va = si_resource(buffer)->gpu_address + offset;
-
-               switch (element_size) {
-               default:
-                       assert(!"Unsupported ring buffer element size");
-               case 0:
-               case 2:
-                       element_size = 0;
-                       break;
-               case 4:
-                       element_size = 1;
-                       break;
-               case 8:
-                       element_size = 2;
-                       break;
-               case 16:
-                       element_size = 3;
-                       break;
-               }
-
-               switch (index_stride) {
-               default:
-                       assert(!"Unsupported ring buffer index stride");
-               case 0:
-               case 8:
-                       index_stride = 0;
-                       break;
-               case 16:
-                       index_stride = 1;
-                       break;
-               case 32:
-                       index_stride = 2;
-                       break;
-               case 64:
-                       index_stride = 3;
-                       break;
-               }
-
-               if (sctx->chip_class >= GFX8 && stride)
-                       num_records *= stride;
-
-               /* Set the descriptor. */
-               uint32_t *desc = descs->list + slot*4;
-               desc[0] = va;
-               desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
-                         S_008F04_STRIDE(stride) |
-                         S_008F04_SWIZZLE_ENABLE(swizzle);
-               desc[2] = num_records;
-               desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
-                         S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-                         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
-                         S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
-                         S_008F0C_INDEX_STRIDE(index_stride) |
-                         S_008F0C_ADD_TID_ENABLE(add_tid);
-
-               if (sctx->chip_class >= GFX9)
-                       assert(!swizzle || element_size == 1); /* always 4 bytes on GFX9 */
-               else
-                       desc[3] |= S_008F0C_ELEMENT_SIZE(element_size);
-
-               if (sctx->chip_class >= GFX10) {
-                       desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
-                                  S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) |
-                                  S_008F0C_RESOURCE_LEVEL(1);
-               } else {
-                       desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
-                                  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
-               }
-
-               pipe_resource_reference(&buffers->buffers[slot], buffer);
-               radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-                                     si_resource(buffer),
-                                     RADEON_USAGE_READWRITE, buffers->priority);
-               buffers->enabled_mask |= 1u << slot;
-       } else {
-               /* Clear the descriptor. */
-               memset(descs->list + slot*4, 0, sizeof(uint32_t) * 4);
-               buffers->enabled_mask &= ~(1u << slot);
-       }
-
-       sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS;
+                             const struct pipe_shader_buffer *sbuffer)
+{
+   si_set_shader_buffer(sctx, &sctx->rw_buffers, SI_DESCS_RW_BUFFERS, slot, sbuffer, true,
+                        RADEON_PRIO_SHADER_RW_BUFFER);
+}
+
+void si_set_ring_buffer(struct si_context *sctx, uint slot, struct pipe_resource *buffer,
+                        unsigned stride, unsigned num_records, bool add_tid, bool swizzle,
+                        unsigned element_size, unsigned index_stride, uint64_t offset)
+{
+   struct si_buffer_resources *buffers = &sctx->rw_buffers;
+   struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS];
+
+   /* The stride field in the resource descriptor has 14 bits */
+   assert(stride < (1 << 14));
+
+   assert(slot < descs->num_elements);
+   pipe_resource_reference(&buffers->buffers[slot], NULL);
+
+   if (buffer) {
+      uint64_t va;
+
+      va = si_resource(buffer)->gpu_address + offset;
+
+      switch (element_size) {
+      default:
+         assert(!"Unsupported ring buffer element size");
+      case 0:
+      case 2:
+         element_size = 0;
+         break;
+      case 4:
+         element_size = 1;
+         break;
+      case 8:
+         element_size = 2;
+         break;
+      case 16:
+         element_size = 3;
+         break;
+      }
+
+      switch (index_stride) {
+      default:
+         assert(!"Unsupported ring buffer index stride");
+      case 0:
+      case 8:
+         index_stride = 0;
+         break;
+      case 16:
+         index_stride = 1;
+         break;
+      case 32:
+         index_stride = 2;
+         break;
+      case 64:
+         index_stride = 3;
+         break;
+      }
+
+      if (sctx->chip_class >= GFX8 && stride)
+         num_records *= stride;
+
+      /* Set the descriptor. */
+      uint32_t *desc = descs->list + slot * 4;
+      desc[0] = va;
+      desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride) |
+                S_008F04_SWIZZLE_ENABLE(swizzle);
+      desc[2] = num_records;
+      desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+                S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+                S_008F0C_INDEX_STRIDE(index_stride) | S_008F0C_ADD_TID_ENABLE(add_tid);
+
+      if (sctx->chip_class >= GFX9)
+         assert(!swizzle || element_size == 1); /* always 4 bytes on GFX9 */
+      else
+         desc[3] |= S_008F0C_ELEMENT_SIZE(element_size);
+
+      if (sctx->chip_class >= GFX10) {
+         desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
+                    S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1);
+      } else {
+         desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+      }
+
+      pipe_resource_reference(&buffers->buffers[slot], buffer);
+      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(buffer), RADEON_USAGE_READWRITE,
+                                buffers->priority);
+      buffers->enabled_mask |= 1u << slot;
+   } else {
+      /* Clear the descriptor. */
+      memset(descs->list + slot * 4, 0, sizeof(uint32_t) * 4);
+      buffers->enabled_mask &= ~(1u << slot);
+   }
+
+   sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS;
 }
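
(Aside: the two switch statements in si_set_ring_buffer() above map byte sizes to 2-bit hardware encodings: 0/2→0, 4→1, 8→2, 16→3 for the element size and 0/8→0, 16→1, 32→2, 64→3 for the index stride. A hedged equivalent using log2 arithmetic, shown only to make the mapping explicit; the driver keeps the switches so unsupported values hit the asserts.)

static unsigned encode_element_size(unsigned bytes)  /* 0/2 -> 0, 4 -> 1, 8 -> 2, 16 -> 3 */
{
   return bytes <= 2 ? 0 : __builtin_ctz(bytes) - 1;
}

static unsigned encode_index_stride(unsigned bytes)  /* 0/8 -> 0, 16 -> 1, 32 -> 2, 64 -> 3 */
{
   return bytes <= 8 ? 0 : __builtin_ctz(bytes) - 3;
}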
 
 /* INTERNAL CONST BUFFERS */
 
-static void si_set_polygon_stipple(struct pipe_context *ctx,
-                                  const struct pipe_poly_stipple *state)
+static void si_set_polygon_stipple(struct pipe_context *ctx, const struct pipe_poly_stipple *state)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct pipe_constant_buffer cb = {};
-       unsigned stipple[32];
-       int i;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct pipe_constant_buffer cb = {};
+   unsigned stipple[32];
+   int i;
 
-       for (i = 0; i < 32; i++)
-               stipple[i] = util_bitreverse(state->stipple[i]);
+   for (i = 0; i < 32; i++)
+      stipple[i] = util_bitreverse(state->stipple[i]);
 
-       cb.user_buffer = stipple;
-       cb.buffer_size = sizeof(stipple);
+   cb.user_buffer = stipple;
+   cb.buffer_size = sizeof(stipple);
 
-       si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE, &cb);
+   si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE, &cb);
 }
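
(Aside: si_set_polygon_stipple() above flips the bit order of each 32-bit stipple row with util_bitreverse() before binding the result as a constant buffer. A self-contained stand-in for that helper, for illustration only; Mesa's actual implementation may use a different technique.)

#include <stdint.h>

/* Reverse the bit order of a 32-bit word: bit 0 becomes bit 31 and so on. */
static uint32_t sketch_bitreverse(uint32_t x)
{
   uint32_t out = 0;
   for (unsigned i = 0; i < 32; i++) {
      out = (out << 1) | (x & 1);
      x >>= 1;
   }
   return out;
}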
 
 /* TEXTURE METADATA ENABLE/DISABLE */
 
-static void
-si_resident_handles_update_needs_color_decompress(struct si_context *sctx)
+static void si_resident_handles_update_needs_color_decompress(struct si_context *sctx)
 {
-       util_dynarray_clear(&sctx->resident_tex_needs_color_decompress);
-       util_dynarray_clear(&sctx->resident_img_needs_color_decompress);
+   util_dynarray_clear(&sctx->resident_tex_needs_color_decompress);
+   util_dynarray_clear(&sctx->resident_img_needs_color_decompress);
 
-       util_dynarray_foreach(&sctx->resident_tex_handles,
-                             struct si_texture_handle *, tex_handle) {
-               struct pipe_resource *res = (*tex_handle)->view->texture;
-               struct si_texture *tex;
+   util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) {
+      struct pipe_resource *res = (*tex_handle)->view->texture;
+      struct si_texture *tex;
 
-               if (!res || res->target == PIPE_BUFFER)
-                       continue;
+      if (!res || res->target == PIPE_BUFFER)
+         continue;
 
-               tex = (struct si_texture *)res;
-               if (!color_needs_decompression(tex))
-                       continue;
+      tex = (struct si_texture *)res;
+      if (!color_needs_decompression(tex))
+         continue;
 
-               util_dynarray_append(&sctx->resident_tex_needs_color_decompress,
-                                    struct si_texture_handle *, *tex_handle);
-       }
+      util_dynarray_append(&sctx->resident_tex_needs_color_decompress, struct si_texture_handle *,
+                           *tex_handle);
+   }
 
-       util_dynarray_foreach(&sctx->resident_img_handles,
-                             struct si_image_handle *, img_handle) {
-               struct pipe_image_view *view = &(*img_handle)->view;
-               struct pipe_resource *res = view->resource;
-               struct si_texture *tex;
+   util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) {
+      struct pipe_image_view *view = &(*img_handle)->view;
+      struct pipe_resource *res = view->resource;
+      struct si_texture *tex;
 
-               if (!res || res->target == PIPE_BUFFER)
-                       continue;
+      if (!res || res->target == PIPE_BUFFER)
+         continue;
 
-               tex = (struct si_texture *)res;
-               if (!color_needs_decompression(tex))
-                       continue;
+      tex = (struct si_texture *)res;
+      if (!color_needs_decompression(tex))
+         continue;
 
-               util_dynarray_append(&sctx->resident_img_needs_color_decompress,
-                                    struct si_image_handle *, *img_handle);
-       }
+      util_dynarray_append(&sctx->resident_img_needs_color_decompress, struct si_image_handle *,
+                           *img_handle);
+   }
 }
 
 /* CMASK can be enabled (for fast clear) and disabled (for texture export)
@@ -1620,13 +1473,13 @@ si_resident_handles_update_needs_color_decompress(struct si_context *sctx)
  */
 void si_update_needs_color_decompress_masks(struct si_context *sctx)
 {
-       for (int i = 0; i < SI_NUM_SHADERS; ++i) {
-               si_samplers_update_needs_color_decompress_mask(&sctx->samplers[i]);
-               si_images_update_needs_color_decompress_mask(&sctx->images[i]);
-               si_update_shader_needs_decompress_mask(sctx, i);
-       }
+   for (int i = 0; i < SI_NUM_SHADERS; ++i) {
+      si_samplers_update_needs_color_decompress_mask(&sctx->samplers[i]);
+      si_images_update_needs_color_decompress_mask(&sctx->images[i]);
+      si_update_shader_needs_decompress_mask(sctx, i);
+   }
 
-       si_resident_handles_update_needs_color_decompress(sctx);
+   si_resident_handles_update_needs_color_decompress(sctx);
 }
 
 /* BUFFER DISCARD/INVALIDATION */
@@ -1634,33 +1487,27 @@ void si_update_needs_color_decompress_masks(struct si_context *sctx)
 /* Reset descriptors of buffer resources after \p buf has been invalidated.
  * If buf == NULL, reset all descriptors.
  */
-static void si_reset_buffer_resources(struct si_context *sctx,
-                                     struct si_buffer_resources *buffers,
-                                     unsigned descriptors_idx,
-                                     unsigned slot_mask,
-                                     struct pipe_resource *buf,
-                                     enum radeon_bo_priority priority)
-{
-       struct si_descriptors *descs = &sctx->descriptors[descriptors_idx];
-       unsigned mask = buffers->enabled_mask & slot_mask;
-
-       while (mask) {
-               unsigned i = u_bit_scan(&mask);
-               struct pipe_resource *buffer = buffers->buffers[i];
-
-               if (buffer && (!buf || buffer == buf)) {
-                       si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i],
-                                               descs->list + i*4);
-                       sctx->descriptors_dirty |= 1u << descriptors_idx;
-
-                       radeon_add_to_gfx_buffer_list_check_mem(sctx,
-                                                               si_resource(buffer),
-                                                               buffers->writable_mask & (1u << i) ?
-                                                                       RADEON_USAGE_READWRITE :
-                                                                       RADEON_USAGE_READ,
-                                                               priority, true);
-               }
-       }
+static void si_reset_buffer_resources(struct si_context *sctx, struct si_buffer_resources *buffers,
+                                      unsigned descriptors_idx, unsigned slot_mask,
+                                      struct pipe_resource *buf, enum radeon_bo_priority priority)
+{
+   struct si_descriptors *descs = &sctx->descriptors[descriptors_idx];
+   unsigned mask = buffers->enabled_mask & slot_mask;
+
+   while (mask) {
+      unsigned i = u_bit_scan(&mask);
+      struct pipe_resource *buffer = buffers->buffers[i];
+
+      if (buffer && (!buf || buffer == buf)) {
+         si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i], descs->list + i * 4);
+         sctx->descriptors_dirty |= 1u << descriptors_idx;
+
+         radeon_add_to_gfx_buffer_list_check_mem(
+            sctx, si_resource(buffer),
+            buffers->writable_mask & (1u << i) ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ,
+            priority, true);
+      }
+   }
 }
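
(Aside: si_reset_buffer_resources() above, like the binding code earlier, tracks one bit per slot in enabled_mask and walks the set bits with u_bit_scan(). A self-contained sketch of that pattern, using the GCC/Clang ctz builtin in place of the Mesa helper.)

/* Visit every slot whose bit is set in the mask, lowest slot first. */
static void sketch_visit_enabled_slots(unsigned enabled_mask, void (*visit)(unsigned slot))
{
   while (enabled_mask) {
      unsigned slot = __builtin_ctz(enabled_mask); /* index of the lowest set bit */
      enabled_mask &= enabled_mask - 1;            /* clear it, as u_bit_scan() does */
      visit(slot);
   }
}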
 
 /* Update all buffer bindings where the buffer is bound, including
@@ -1671,436 +1518,389 @@ static void si_reset_buffer_resources(struct si_context *sctx,
  */
 void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf)
 {
-       struct si_resource *buffer = si_resource(buf);
-       unsigned i, shader;
-       unsigned num_elems = sctx->num_vertex_elements;
-
-       /* We changed the buffer, now we need to bind it where the old one
-        * was bound. This consists of 2 things:
-        *   1) Updating the resource descriptor and dirtying it.
-        *   2) Adding a relocation to the CS, so that it's usable.
-        */
-
-       /* Vertex buffers. */
-       if (!buffer) {
-               if (num_elems)
-                       sctx->vertex_buffers_dirty = true;
-       } else if (buffer->bind_history & PIPE_BIND_VERTEX_BUFFER) {
-               for (i = 0; i < num_elems; i++) {
-                       int vb = sctx->vertex_elements->vertex_buffer_index[i];
-
-                       if (vb >= ARRAY_SIZE(sctx->vertex_buffer))
-                               continue;
-                       if (!sctx->vertex_buffer[vb].buffer.resource)
-                               continue;
-
-                       if (sctx->vertex_buffer[vb].buffer.resource == buf) {
-                               sctx->vertex_buffers_dirty = true;
-                               break;
-                       }
-               }
-       }
-
-       /* Streamout buffers. (other internal buffers can't be invalidated) */
-       if (!buffer || buffer->bind_history & PIPE_BIND_STREAM_OUTPUT) {
-               for (i = SI_VS_STREAMOUT_BUF0; i <= SI_VS_STREAMOUT_BUF3; i++) {
-                       struct si_buffer_resources *buffers = &sctx->rw_buffers;
-                       struct si_descriptors *descs =
-                               &sctx->descriptors[SI_DESCS_RW_BUFFERS];
-                       struct pipe_resource *buffer = buffers->buffers[i];
-
-                       if (!buffer || (buf && buffer != buf))
-                               continue;
-
-                       si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i],
-                                               descs->list + i*4);
-                       sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS;
-
-                       radeon_add_to_gfx_buffer_list_check_mem(sctx,
-                                                               si_resource(buffer),
-                                                               RADEON_USAGE_WRITE,
-                                                               RADEON_PRIO_SHADER_RW_BUFFER,
-                                                               true);
-
-                       /* Update the streamout state. */
-                       if (sctx->streamout.begin_emitted)
-                               si_emit_streamout_end(sctx);
-                       sctx->streamout.append_bitmask =
-                                       sctx->streamout.enabled_mask;
-                       si_streamout_buffers_dirty(sctx);
-               }
-       }
-
-       /* Constant and shader buffers. */
-       if (!buffer || buffer->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
-               for (shader = 0; shader < SI_NUM_SHADERS; shader++)
-                       si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader],
-                                                 si_const_and_shader_buffer_descriptors_idx(shader),
-                                                 u_bit_consecutive(SI_NUM_SHADER_BUFFERS, SI_NUM_CONST_BUFFERS),
-                                                 buf,
-                                                 sctx->const_and_shader_buffers[shader].priority_constbuf);
-       }
-
-       if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_BUFFER) {
-               for (shader = 0; shader < SI_NUM_SHADERS; shader++)
-                       si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader],
-                                                 si_const_and_shader_buffer_descriptors_idx(shader),
-                                                 u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS),
-                                                 buf,
-                                                 sctx->const_and_shader_buffers[shader].priority);
-       }
-
-       if (!buffer || buffer->bind_history & PIPE_BIND_SAMPLER_VIEW) {
-               /* Texture buffers - update bindings. */
-               for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
-                       struct si_samplers *samplers = &sctx->samplers[shader];
-                       struct si_descriptors *descs =
-                               si_sampler_and_image_descriptors(sctx, shader);
-                       unsigned mask = samplers->enabled_mask;
-
-                       while (mask) {
-                               unsigned i = u_bit_scan(&mask);
-                               struct pipe_resource *buffer = samplers->views[i]->texture;
-
-                               if (buffer && buffer->target == PIPE_BUFFER &&
-                                   (!buf || buffer == buf)) {
-                                       unsigned desc_slot = si_get_sampler_slot(i);
-
-                                       si_set_buf_desc_address(si_resource(buffer),
-                                                               samplers->views[i]->u.buf.offset,
-                                                               descs->list + desc_slot * 16 + 4);
-                                       sctx->descriptors_dirty |=
-                                               1u << si_sampler_and_image_descriptors_idx(shader);
-
-                                       radeon_add_to_gfx_buffer_list_check_mem(
-                                               sctx, si_resource(buffer),
-                                               RADEON_USAGE_READ,
-                                               RADEON_PRIO_SAMPLER_BUFFER, true);
-                               }
-                       }
-               }
-       }
-
-       /* Shader images */
-       if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_IMAGE) {
-               for (shader = 0; shader < SI_NUM_SHADERS; ++shader) {
-                       struct si_images *images = &sctx->images[shader];
-                       struct si_descriptors *descs =
-                               si_sampler_and_image_descriptors(sctx, shader);
-                       unsigned mask = images->enabled_mask;
-
-                       while (mask) {
-                               unsigned i = u_bit_scan(&mask);
-                               struct pipe_resource *buffer = images->views[i].resource;
-
-                               if (buffer && buffer->target == PIPE_BUFFER &&
-                                   (!buf || buffer == buf)) {
-                                       unsigned desc_slot = si_get_image_slot(i);
-
-                                       if (images->views[i].access & PIPE_IMAGE_ACCESS_WRITE)
-                                               si_mark_image_range_valid(&images->views[i]);
-
-                                       si_set_buf_desc_address(si_resource(buffer),
-                                                               images->views[i].u.buf.offset,
-                                                               descs->list + desc_slot * 8 + 4);
-                                       sctx->descriptors_dirty |=
-                                               1u << si_sampler_and_image_descriptors_idx(shader);
-
-                                       radeon_add_to_gfx_buffer_list_check_mem(
-                                               sctx, si_resource(buffer),
-                                               RADEON_USAGE_READWRITE,
-                                               RADEON_PRIO_SAMPLER_BUFFER, true);
-                               }
-                       }
-               }
-       }
-
-       /* Bindless texture handles */
-       if (!buffer || buffer->texture_handle_allocated) {
-               struct si_descriptors *descs = &sctx->bindless_descriptors;
-
-               util_dynarray_foreach(&sctx->resident_tex_handles,
-                                     struct si_texture_handle *, tex_handle) {
-                       struct pipe_sampler_view *view = (*tex_handle)->view;
-                       unsigned desc_slot = (*tex_handle)->desc_slot;
-                       struct pipe_resource *buffer = view->texture;
-
-                       if (buffer && buffer->target == PIPE_BUFFER &&
-                           (!buf || buffer == buf)) {
-                               si_set_buf_desc_address(si_resource(buffer),
-                                                       view->u.buf.offset,
-                                                       descs->list +
-                                                       desc_slot * 16 + 4);
-
-                               (*tex_handle)->desc_dirty = true;
-                               sctx->bindless_descriptors_dirty = true;
-
-                               radeon_add_to_gfx_buffer_list_check_mem(
-                                       sctx, si_resource(buffer),
-                                       RADEON_USAGE_READ,
-                                       RADEON_PRIO_SAMPLER_BUFFER, true);
-                       }
-               }
-       }
-
-       /* Bindless image handles */
-       if (!buffer || buffer->image_handle_allocated) {
-               struct si_descriptors *descs = &sctx->bindless_descriptors;
-
-               util_dynarray_foreach(&sctx->resident_img_handles,
-                                     struct si_image_handle *, img_handle) {
-                       struct pipe_image_view *view = &(*img_handle)->view;
-                       unsigned desc_slot = (*img_handle)->desc_slot;
-                       struct pipe_resource *buffer = view->resource;
-
-                       if (buffer && buffer->target == PIPE_BUFFER &&
-                           (!buf || buffer == buf)) {
-                               if (view->access & PIPE_IMAGE_ACCESS_WRITE)
-                                       si_mark_image_range_valid(view);
-
-                               si_set_buf_desc_address(si_resource(buffer),
-                                                       view->u.buf.offset,
-                                                       descs->list +
-                                                       desc_slot * 16 + 4);
-
-                               (*img_handle)->desc_dirty = true;
-                               sctx->bindless_descriptors_dirty = true;
-
-                               radeon_add_to_gfx_buffer_list_check_mem(
-                                       sctx, si_resource(buffer),
-                                       RADEON_USAGE_READWRITE,
-                                       RADEON_PRIO_SAMPLER_BUFFER, true);
-                       }
-               }
-       }
-
-       if (buffer) {
-               /* Do the same for other contexts. They will invoke this function
-                * with buffer == NULL.
-                */
-               unsigned new_counter = p_atomic_inc_return(&sctx->screen->dirty_buf_counter);
-
-               /* Skip the update for the current context, because we have already updated
-                * the buffer bindings.
-                */
-               if (new_counter == sctx->last_dirty_buf_counter + 1)
-                       sctx->last_dirty_buf_counter = new_counter;
-       }
-}
-
-static void si_upload_bindless_descriptor(struct si_context *sctx,
-                                         unsigned desc_slot,
-                                         unsigned num_dwords)
-{
-       struct si_descriptors *desc = &sctx->bindless_descriptors;
-       unsigned desc_slot_offset = desc_slot * 16;
-       uint32_t *data;
-       uint64_t va;
-
-       data = desc->list + desc_slot_offset;
-       va = desc->gpu_address + desc_slot_offset * 4;
-
-       si_cp_write_data(sctx, desc->buffer, va - desc->buffer->gpu_address,
-                        num_dwords * 4, V_370_TC_L2, V_370_ME, data);
+   struct si_resource *buffer = si_resource(buf);
+   unsigned i, shader;
+   unsigned num_elems = sctx->num_vertex_elements;
+
+   /* We changed the buffer, now we need to bind it where the old one
+    * was bound. This consists of 2 things:
+    *   1) Updating the resource descriptor and dirtying it.
+    *   2) Adding a relocation to the CS, so that it's usable.
+    */
+
+   /* Vertex buffers. */
+   if (!buffer) {
+      if (num_elems)
+         sctx->vertex_buffers_dirty = true;
+   } else if (buffer->bind_history & PIPE_BIND_VERTEX_BUFFER) {
+      for (i = 0; i < num_elems; i++) {
+         int vb = sctx->vertex_elements->vertex_buffer_index[i];
+
+         if (vb >= ARRAY_SIZE(sctx->vertex_buffer))
+            continue;
+         if (!sctx->vertex_buffer[vb].buffer.resource)
+            continue;
+
+         if (sctx->vertex_buffer[vb].buffer.resource == buf) {
+            sctx->vertex_buffers_dirty = true;
+            break;
+         }
+      }
+   }
+
+   /* Streamout buffers. (other internal buffers can't be invalidated) */
+   if (!buffer || buffer->bind_history & PIPE_BIND_STREAM_OUTPUT) {
+      for (i = SI_VS_STREAMOUT_BUF0; i <= SI_VS_STREAMOUT_BUF3; i++) {
+         struct si_buffer_resources *buffers = &sctx->rw_buffers;
+         struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS];
+         struct pipe_resource *buffer = buffers->buffers[i];
+
+         if (!buffer || (buf && buffer != buf))
+            continue;
+
+         si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i], descs->list + i * 4);
+         sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS;
+
+         radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer), RADEON_USAGE_WRITE,
+                                                 RADEON_PRIO_SHADER_RW_BUFFER, true);
+
+         /* Update the streamout state. */
+         if (sctx->streamout.begin_emitted)
+            si_emit_streamout_end(sctx);
+         sctx->streamout.append_bitmask = sctx->streamout.enabled_mask;
+         si_streamout_buffers_dirty(sctx);
+      }
+   }
+
+   /* Constant and shader buffers. */
+   if (!buffer || buffer->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
+      for (shader = 0; shader < SI_NUM_SHADERS; shader++)
+         si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader],
+                                   si_const_and_shader_buffer_descriptors_idx(shader),
+                                   u_bit_consecutive(SI_NUM_SHADER_BUFFERS, SI_NUM_CONST_BUFFERS),
+                                   buf, sctx->const_and_shader_buffers[shader].priority_constbuf);
+   }
+
+   if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_BUFFER) {
+      for (shader = 0; shader < SI_NUM_SHADERS; shader++)
+         si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader],
+                                   si_const_and_shader_buffer_descriptors_idx(shader),
+                                   u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS), buf,
+                                   sctx->const_and_shader_buffers[shader].priority);
+   }
+
+   if (!buffer || buffer->bind_history & PIPE_BIND_SAMPLER_VIEW) {
+      /* Texture buffers - update bindings. */
+      for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
+         struct si_samplers *samplers = &sctx->samplers[shader];
+         struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader);
+         unsigned mask = samplers->enabled_mask;
+
+         while (mask) {
+            unsigned i = u_bit_scan(&mask);
+            struct pipe_resource *buffer = samplers->views[i]->texture;
+
+            if (buffer && buffer->target == PIPE_BUFFER && (!buf || buffer == buf)) {
+               unsigned desc_slot = si_get_sampler_slot(i);
+
+               si_set_buf_desc_address(si_resource(buffer), samplers->views[i]->u.buf.offset,
+                                       descs->list + desc_slot * 16 + 4);
+               sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
+
+               radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer), RADEON_USAGE_READ,
+                                                       RADEON_PRIO_SAMPLER_BUFFER, true);
+            }
+         }
+      }
+   }
+
+   /* Shader images */
+   if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_IMAGE) {
+      for (shader = 0; shader < SI_NUM_SHADERS; ++shader) {
+         struct si_images *images = &sctx->images[shader];
+         struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader);
+         unsigned mask = images->enabled_mask;
+
+         while (mask) {
+            unsigned i = u_bit_scan(&mask);
+            struct pipe_resource *buffer = images->views[i].resource;
+
+            if (buffer && buffer->target == PIPE_BUFFER && (!buf || buffer == buf)) {
+               unsigned desc_slot = si_get_image_slot(i);
+
+               if (images->views[i].access & PIPE_IMAGE_ACCESS_WRITE)
+                  si_mark_image_range_valid(&images->views[i]);
+
+               si_set_buf_desc_address(si_resource(buffer), images->views[i].u.buf.offset,
+                                       descs->list + desc_slot * 8 + 4);
+               sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
+
+               radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer),
+                                                       RADEON_USAGE_READWRITE,
+                                                       RADEON_PRIO_SAMPLER_BUFFER, true);
+            }
+         }
+      }
+   }
+
+   /* Bindless texture handles */
+   if (!buffer || buffer->texture_handle_allocated) {
+      struct si_descriptors *descs = &sctx->bindless_descriptors;
+
+      util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) {
+         struct pipe_sampler_view *view = (*tex_handle)->view;
+         unsigned desc_slot = (*tex_handle)->desc_slot;
+         struct pipe_resource *buffer = view->texture;
+
+         if (buffer && buffer->target == PIPE_BUFFER && (!buf || buffer == buf)) {
+            si_set_buf_desc_address(si_resource(buffer), view->u.buf.offset,
+                                    descs->list + desc_slot * 16 + 4);
+
+            (*tex_handle)->desc_dirty = true;
+            sctx->bindless_descriptors_dirty = true;
+
+            radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer), RADEON_USAGE_READ,
+                                                    RADEON_PRIO_SAMPLER_BUFFER, true);
+         }
+      }
+   }
+
+   /* Bindless image handles */
+   if (!buffer || buffer->image_handle_allocated) {
+      struct si_descriptors *descs = &sctx->bindless_descriptors;
+
+      util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) {
+         struct pipe_image_view *view = &(*img_handle)->view;
+         unsigned desc_slot = (*img_handle)->desc_slot;
+         struct pipe_resource *buffer = view->resource;
+
+         if (buffer && buffer->target == PIPE_BUFFER && (!buf || buffer == buf)) {
+            if (view->access & PIPE_IMAGE_ACCESS_WRITE)
+               si_mark_image_range_valid(view);
+
+            si_set_buf_desc_address(si_resource(buffer), view->u.buf.offset,
+                                    descs->list + desc_slot * 16 + 4);
+
+            (*img_handle)->desc_dirty = true;
+            sctx->bindless_descriptors_dirty = true;
+
+            radeon_add_to_gfx_buffer_list_check_mem(
+               sctx, si_resource(buffer), RADEON_USAGE_READWRITE, RADEON_PRIO_SAMPLER_BUFFER, true);
+         }
+      }
+   }
+
+   if (buffer) {
+      /* Do the same for other contexts. They will invoke this function
+       * with buffer == NULL.
+       */
+      unsigned new_counter = p_atomic_inc_return(&sctx->screen->dirty_buf_counter);
+
+      /* Skip the update for the current context, because we have already updated
+       * the buffer bindings.
+       */
+      if (new_counter == sctx->last_dirty_buf_counter + 1)
+         sctx->last_dirty_buf_counter = new_counter;
+   }
+}
+
+static void si_upload_bindless_descriptor(struct si_context *sctx, unsigned desc_slot,
+                                          unsigned num_dwords)
+{
+   struct si_descriptors *desc = &sctx->bindless_descriptors;
+   unsigned desc_slot_offset = desc_slot * 16;
+   uint32_t *data;
+   uint64_t va;
+
+   data = desc->list + desc_slot_offset;
+   va = desc->gpu_address + desc_slot_offset * 4;
+
+   si_cp_write_data(sctx, desc->buffer, va - desc->buffer->gpu_address, num_dwords * 4, V_370_TC_L2,
+                    V_370_ME, data);
 }
 
 static void si_upload_bindless_descriptors(struct si_context *sctx)
 {
-       if (!sctx->bindless_descriptors_dirty)
-               return;
+   if (!sctx->bindless_descriptors_dirty)
+      return;
 
-       /* Wait for graphics/compute to be idle before updating the resident
-        * descriptors directly in memory, in case the GPU is using them.
-        */
-       sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-                        SI_CONTEXT_CS_PARTIAL_FLUSH;
-       sctx->emit_cache_flush(sctx);
+   /* Wait for graphics/compute to be idle before updating the resident
+    * descriptors directly in memory, in case the GPU is using them.
+    */
+   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
+   sctx->emit_cache_flush(sctx);
 
-       util_dynarray_foreach(&sctx->resident_tex_handles,
-                             struct si_texture_handle *, tex_handle) {
-               unsigned desc_slot = (*tex_handle)->desc_slot;
+   util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) {
+      unsigned desc_slot = (*tex_handle)->desc_slot;
 
-               if (!(*tex_handle)->desc_dirty)
-                       continue;
+      if (!(*tex_handle)->desc_dirty)
+         continue;
 
-               si_upload_bindless_descriptor(sctx, desc_slot, 16);
-               (*tex_handle)->desc_dirty = false;
-       }
+      si_upload_bindless_descriptor(sctx, desc_slot, 16);
+      (*tex_handle)->desc_dirty = false;
+   }
 
-       util_dynarray_foreach(&sctx->resident_img_handles,
-                             struct si_image_handle *, img_handle) {
-               unsigned desc_slot = (*img_handle)->desc_slot;
+   util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) {
+      unsigned desc_slot = (*img_handle)->desc_slot;
 
-               if (!(*img_handle)->desc_dirty)
-                       continue;
+      if (!(*img_handle)->desc_dirty)
+         continue;
 
-               si_upload_bindless_descriptor(sctx, desc_slot, 8);
-               (*img_handle)->desc_dirty = false;
-       }
+      si_upload_bindless_descriptor(sctx, desc_slot, 8);
+      (*img_handle)->desc_dirty = false;
+   }
 
-       /* Invalidate L1 because it doesn't know that L2 changed. */
-       sctx->flags |= SI_CONTEXT_INV_SCACHE;
-       sctx->emit_cache_flush(sctx);
+   /* Invalidate L1 because it doesn't know that L2 changed. */
+   sctx->flags |= SI_CONTEXT_INV_SCACHE;
+   sctx->emit_cache_flush(sctx);
 
-       sctx->bindless_descriptors_dirty = false;
+   sctx->bindless_descriptors_dirty = false;
 }
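
(Aside: resident bindless handles live in one shared descriptor list, 16 dwords per slot; si_upload_bindless_descriptor() converts a slot index into a dword pointer and a GPU address, and the loop above writes back 16 dwords for texture handles and 8 for image handles. A small worked sketch of that offset arithmetic, with a hypothetical slot number.)

#include <stdint.h>

/* Hypothetical slot 5 in the bindless descriptor list, as addressed above. */
enum { EXAMPLE_SLOT = 5 };
static const unsigned example_dword_offset = EXAMPLE_SLOT * 16;     /* 80 dwords into desc->list */
static const uint64_t example_byte_offset  = EXAMPLE_SLOT * 16 * 4; /* 320 bytes past desc->gpu_address */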
 
 /* Update mutable image descriptor fields of all resident textures. */
 static void si_update_bindless_texture_descriptor(struct si_context *sctx,
-                                                 struct si_texture_handle *tex_handle)
+                                                  struct si_texture_handle *tex_handle)
 {
-       struct si_sampler_view *sview = (struct si_sampler_view *)tex_handle->view;
-       struct si_descriptors *desc = &sctx->bindless_descriptors;
-       unsigned desc_slot_offset = tex_handle->desc_slot * 16;
-       uint32_t desc_list[16];
+   struct si_sampler_view *sview = (struct si_sampler_view *)tex_handle->view;
+   struct si_descriptors *desc = &sctx->bindless_descriptors;
+   unsigned desc_slot_offset = tex_handle->desc_slot * 16;
+   uint32_t desc_list[16];
 
-       if (sview->base.texture->target == PIPE_BUFFER)
-               return;
+   if (sview->base.texture->target == PIPE_BUFFER)
+      return;
 
-       memcpy(desc_list, desc->list + desc_slot_offset, sizeof(desc_list));
-       si_set_sampler_view_desc(sctx, sview, &tex_handle->sstate,
-                                desc->list + desc_slot_offset);
+   memcpy(desc_list, desc->list + desc_slot_offset, sizeof(desc_list));
+   si_set_sampler_view_desc(sctx, sview, &tex_handle->sstate, desc->list + desc_slot_offset);
 
-       if (memcmp(desc_list, desc->list + desc_slot_offset,
-                  sizeof(desc_list))) {
-               tex_handle->desc_dirty = true;
-               sctx->bindless_descriptors_dirty = true;
-       }
+   if (memcmp(desc_list, desc->list + desc_slot_offset, sizeof(desc_list))) {
+      tex_handle->desc_dirty = true;
+      sctx->bindless_descriptors_dirty = true;
+   }
 }
 
 static void si_update_bindless_image_descriptor(struct si_context *sctx,
-                                               struct si_image_handle *img_handle)
+                                                struct si_image_handle *img_handle)
 {
-       struct si_descriptors *desc = &sctx->bindless_descriptors;
-       unsigned desc_slot_offset = img_handle->desc_slot * 16;
-       struct pipe_image_view *view = &img_handle->view;
-       struct pipe_resource *res = view->resource;
-       uint32_t image_desc[16];
-       unsigned desc_size = (res->nr_samples >= 2 ? 16 : 8) * 4;
+   struct si_descriptors *desc = &sctx->bindless_descriptors;
+   unsigned desc_slot_offset = img_handle->desc_slot * 16;
+   struct pipe_image_view *view = &img_handle->view;
+   struct pipe_resource *res = view->resource;
+   uint32_t image_desc[16];
+   unsigned desc_size = (res->nr_samples >= 2 ? 16 : 8) * 4;
 
-       if (res->target == PIPE_BUFFER)
-               return;
+   if (res->target == PIPE_BUFFER)
+      return;
 
-       memcpy(image_desc, desc->list + desc_slot_offset, desc_size);
-       si_set_shader_image_desc(sctx, view, true,
-                                desc->list + desc_slot_offset,
-                                desc->list + desc_slot_offset + 8);
+   memcpy(image_desc, desc->list + desc_slot_offset, desc_size);
+   si_set_shader_image_desc(sctx, view, true, desc->list + desc_slot_offset,
+                            desc->list + desc_slot_offset + 8);
 
-       if (memcmp(image_desc, desc->list + desc_slot_offset, desc_size)) {
-               img_handle->desc_dirty = true;
-               sctx->bindless_descriptors_dirty = true;
-       }
+   if (memcmp(image_desc, desc->list + desc_slot_offset, desc_size)) {
+      img_handle->desc_dirty = true;
+      sctx->bindless_descriptors_dirty = true;
+   }
 }
 
 static void si_update_all_resident_texture_descriptors(struct si_context *sctx)
 {
-       util_dynarray_foreach(&sctx->resident_tex_handles,
-                             struct si_texture_handle *, tex_handle) {
-               si_update_bindless_texture_descriptor(sctx, *tex_handle);
-       }
+   util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) {
+      si_update_bindless_texture_descriptor(sctx, *tex_handle);
+   }
 
-       util_dynarray_foreach(&sctx->resident_img_handles,
-                             struct si_image_handle *, img_handle) {
-               si_update_bindless_image_descriptor(sctx, *img_handle);
-       }
+   util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) {
+      si_update_bindless_image_descriptor(sctx, *img_handle);
+   }
 
-       si_upload_bindless_descriptors(sctx);
+   si_upload_bindless_descriptors(sctx);
 }
 
 /* Update mutable image descriptor fields of all bound textures. */
 void si_update_all_texture_descriptors(struct si_context *sctx)
 {
-       unsigned shader;
+   unsigned shader;
 
-       for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
-               struct si_samplers *samplers = &sctx->samplers[shader];
-               struct si_images *images = &sctx->images[shader];
-               unsigned mask;
+   for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
+      struct si_samplers *samplers = &sctx->samplers[shader];
+      struct si_images *images = &sctx->images[shader];
+      unsigned mask;
 
-               /* Images. */
-               mask = images->enabled_mask;
-               while (mask) {
-                       unsigned i = u_bit_scan(&mask);
-                       struct pipe_image_view *view = &images->views[i];
+      /* Images. */
+      mask = images->enabled_mask;
+      while (mask) {
+         unsigned i = u_bit_scan(&mask);
+         struct pipe_image_view *view = &images->views[i];
 
-                       if (!view->resource ||
-                           view->resource->target == PIPE_BUFFER)
-                               continue;
+         if (!view->resource || view->resource->target == PIPE_BUFFER)
+            continue;
 
-                       si_set_shader_image(sctx, shader, i, view, true);
-               }
+         si_set_shader_image(sctx, shader, i, view, true);
+      }
 
-               /* Sampler views. */
-               mask = samplers->enabled_mask;
-               while (mask) {
-                       unsigned i = u_bit_scan(&mask);
-                       struct pipe_sampler_view *view = samplers->views[i];
+      /* Sampler views. */
+      mask = samplers->enabled_mask;
+      while (mask) {
+         unsigned i = u_bit_scan(&mask);
+         struct pipe_sampler_view *view = samplers->views[i];
 
-                       if (!view ||
-                           !view->texture ||
-                           view->texture->target == PIPE_BUFFER)
-                               continue;
+         if (!view || !view->texture || view->texture->target == PIPE_BUFFER)
+            continue;
 
-                       si_set_sampler_view(sctx, shader, i,
-                                           samplers->views[i], true);
-               }
+         si_set_sampler_view(sctx, shader, i, samplers->views[i], true);
+      }
 
-               si_update_shader_needs_decompress_mask(sctx, shader);
-       }
+      si_update_shader_needs_decompress_mask(sctx, shader);
+   }
 
-       si_update_all_resident_texture_descriptors(sctx);
-       si_update_ps_colorbuf0_slot(sctx);
+   si_update_all_resident_texture_descriptors(sctx);
+   si_update_ps_colorbuf0_slot(sctx);
 }
 
 /* SHADER USER DATA */
 
-static void si_mark_shader_pointers_dirty(struct si_context *sctx,
-                                         unsigned shader)
+static void si_mark_shader_pointers_dirty(struct si_context *sctx, unsigned shader)
 {
-       sctx->shader_pointers_dirty |=
-               u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS,
-                                 SI_NUM_SHADER_DESCS);
+   sctx->shader_pointers_dirty |=
+      u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS, SI_NUM_SHADER_DESCS);
 
-       if (shader == PIPE_SHADER_VERTEX) {
-               sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL;
-               sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 &&
-                                                      sctx->screen->num_vbos_in_user_sgprs;
-       }
+   if (shader == PIPE_SHADER_VERTEX) {
+      sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL;
+      sctx->vertex_buffer_user_sgprs_dirty =
+         sctx->num_vertex_elements > 0 && sctx->screen->num_vbos_in_user_sgprs;
+   }
 
-       si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
 }
 
 static void si_shader_pointers_begin_new_cs(struct si_context *sctx)
 {
-       sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
-       sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL;
-       sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 &&
-                                              sctx->screen->num_vbos_in_user_sgprs;
-       si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
-       sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
-       sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
+   sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
+   sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL;
+   sctx->vertex_buffer_user_sgprs_dirty =
+      sctx->num_vertex_elements > 0 && sctx->screen->num_vbos_in_user_sgprs;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
+   sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
+   sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
 }
 
 /* Set a base register address for user data constants in the given shader.
  * This assigns a mapping from PIPE_SHADER_* to SPI_SHADER_USER_DATA_*.
  */
-static void si_set_user_data_base(struct si_context *sctx,
-                                 unsigned shader, uint32_t new_base)
+static void si_set_user_data_base(struct si_context *sctx, unsigned shader, uint32_t new_base)
 {
-       uint32_t *base = &sctx->shader_pointers.sh_base[shader];
+   uint32_t *base = &sctx->shader_pointers.sh_base[shader];
 
-       if (*base != new_base) {
-               *base = new_base;
+   if (*base != new_base) {
+      *base = new_base;
 
-               if (new_base)
-                       si_mark_shader_pointers_dirty(sctx, shader);
+      if (new_base)
+         si_mark_shader_pointers_dirty(sctx, shader);
 
-               /* Any change in enabled shader stages requires re-emitting
-                * the VS state SGPR, because it contains the clamp_vertex_color
-                * state, which can be done in VS, TES, and GS.
-                */
-               sctx->last_vs_state = ~0;
-       }
+      /* Any change in enabled shader stages requires re-emitting
+       * the VS state SGPR, because it contains the clamp_vertex_color
+       * state, which can be done in VS, TES, and GS.
+       */
+      sctx->last_vs_state = ~0;
+   }
 }
 
 /* This must be called when these are changed between enabled and disabled
@@ -2110,922 +1910,822 @@ static void si_set_user_data_base(struct si_context *sctx,
  */
 void si_shader_change_notify(struct si_context *sctx)
 {
-       /* VS can be bound as VS, ES, or LS. */
-       if (sctx->tes_shader.cso) {
-               if (sctx->chip_class >= GFX10) {
-                       si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
-                                             R_00B430_SPI_SHADER_USER_DATA_HS_0);
-               } else if (sctx->chip_class == GFX9) {
-                       si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
-                                             R_00B430_SPI_SHADER_USER_DATA_LS_0);
-               } else {
-                       si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
-                                             R_00B530_SPI_SHADER_USER_DATA_LS_0);
-               }
-       } else if (sctx->chip_class >= GFX10) {
-               if (sctx->ngg || sctx->gs_shader.cso) {
-                       si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
-                                             R_00B230_SPI_SHADER_USER_DATA_GS_0);
-               } else {
-                       si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
-                                             R_00B130_SPI_SHADER_USER_DATA_VS_0);
-               }
-       } else if (sctx->gs_shader.cso) {
-               si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
-                                     R_00B330_SPI_SHADER_USER_DATA_ES_0);
-       } else {
-               si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
-                                     R_00B130_SPI_SHADER_USER_DATA_VS_0);
-       }
-
-       /* TES can be bound as ES, VS, or not bound. */
-       if (sctx->tes_shader.cso) {
-               if (sctx->chip_class >= GFX10) {
-                       if (sctx->ngg || sctx->gs_shader.cso) {
-                               si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
-                                                     R_00B230_SPI_SHADER_USER_DATA_GS_0);
-                       } else {
-                               si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
-                                                     R_00B130_SPI_SHADER_USER_DATA_VS_0);
-                       }
-               } else if (sctx->gs_shader.cso) {
-                       si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
-                                             R_00B330_SPI_SHADER_USER_DATA_ES_0);
-               } else {
-                       si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
-                                             R_00B130_SPI_SHADER_USER_DATA_VS_0);
-               }
-       } else {
-               si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, 0);
-       }
-}
-
-static void si_emit_shader_pointer_head(struct radeon_cmdbuf *cs,
-                                       unsigned sh_offset,
-                                       unsigned pointer_count)
-{
-       radeon_emit(cs, PKT3(PKT3_SET_SH_REG, pointer_count, 0));
-       radeon_emit(cs, (sh_offset - SI_SH_REG_OFFSET) >> 2);
-}
-
-static void si_emit_shader_pointer_body(struct si_screen *sscreen,
-                                       struct radeon_cmdbuf *cs,
-                                       uint64_t va)
-{
-       radeon_emit(cs, va);
-
-       assert(va == 0 || (va >> 32) == sscreen->info.address32_hi);
-}
-
-static void si_emit_shader_pointer(struct si_context *sctx,
-                                  struct si_descriptors *desc,
-                                  unsigned sh_base)
-{
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       unsigned sh_offset = sh_base + desc->shader_userdata_offset;
-
-       si_emit_shader_pointer_head(cs, sh_offset, 1);
-       si_emit_shader_pointer_body(sctx->screen, cs, desc->gpu_address);
-}
-
-static void si_emit_consecutive_shader_pointers(struct si_context *sctx,
-                                               unsigned pointer_mask,
-                                               unsigned sh_base)
-{
-       if (!sh_base)
-               return;
-
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       unsigned mask = sctx->shader_pointers_dirty & pointer_mask;
-
-       while (mask) {
-               int start, count;
-               u_bit_scan_consecutive_range(&mask, &start, &count);
-
-               struct si_descriptors *descs = &sctx->descriptors[start];
-               unsigned sh_offset = sh_base + descs->shader_userdata_offset;
-
-               si_emit_shader_pointer_head(cs, sh_offset, count);
-               for (int i = 0; i < count; i++)
-                       si_emit_shader_pointer_body(sctx->screen, cs,
-                                                   descs[i].gpu_address);
-       }
-}
-
-static void si_emit_global_shader_pointers(struct si_context *sctx,
-                                          struct si_descriptors *descs)
-{
-       if (sctx->chip_class >= GFX10) {
-               si_emit_shader_pointer(sctx, descs,
-                                      R_00B030_SPI_SHADER_USER_DATA_PS_0);
-               /* HW VS stage only used in non-NGG mode. */
-               si_emit_shader_pointer(sctx, descs,
-                                      R_00B130_SPI_SHADER_USER_DATA_VS_0);
-               si_emit_shader_pointer(sctx, descs,
-                                      R_00B230_SPI_SHADER_USER_DATA_GS_0);
-               si_emit_shader_pointer(sctx, descs,
-                                      R_00B430_SPI_SHADER_USER_DATA_HS_0);
-               return;
-       } else if (sctx->chip_class == GFX9) {
-               /* Broadcast it to all shader stages. */
-               si_emit_shader_pointer(sctx, descs,
-                                      R_00B530_SPI_SHADER_USER_DATA_COMMON_0);
-               return;
-       }
-
-       si_emit_shader_pointer(sctx, descs,
-                              R_00B030_SPI_SHADER_USER_DATA_PS_0);
-       si_emit_shader_pointer(sctx, descs,
-                              R_00B130_SPI_SHADER_USER_DATA_VS_0);
-       si_emit_shader_pointer(sctx, descs,
-                              R_00B330_SPI_SHADER_USER_DATA_ES_0);
-       si_emit_shader_pointer(sctx, descs,
-                              R_00B230_SPI_SHADER_USER_DATA_GS_0);
-       si_emit_shader_pointer(sctx, descs,
-                              R_00B430_SPI_SHADER_USER_DATA_HS_0);
-       si_emit_shader_pointer(sctx, descs,
-                              R_00B530_SPI_SHADER_USER_DATA_LS_0);
+   /* VS can be bound as VS, ES, or LS. */
+   if (sctx->tes_shader.cso) {
+      if (sctx->chip_class >= GFX10) {
+         si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B430_SPI_SHADER_USER_DATA_HS_0);
+      } else if (sctx->chip_class == GFX9) {
+         si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B430_SPI_SHADER_USER_DATA_LS_0);
+      } else {
+         si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B530_SPI_SHADER_USER_DATA_LS_0);
+      }
+   } else if (sctx->chip_class >= GFX10) {
+      if (sctx->ngg || sctx->gs_shader.cso) {
+         si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B230_SPI_SHADER_USER_DATA_GS_0);
+      } else {
+         si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0);
+      }
+   } else if (sctx->gs_shader.cso) {
+      si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B330_SPI_SHADER_USER_DATA_ES_0);
+   } else {
+      si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0);
+   }
+
+   /* TES can be bound as ES, VS, or not bound. */
+   if (sctx->tes_shader.cso) {
+      if (sctx->chip_class >= GFX10) {
+         if (sctx->ngg || sctx->gs_shader.cso) {
+            si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, R_00B230_SPI_SHADER_USER_DATA_GS_0);
+         } else {
+            si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, R_00B130_SPI_SHADER_USER_DATA_VS_0);
+         }
+      } else if (sctx->gs_shader.cso) {
+         si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, R_00B330_SPI_SHADER_USER_DATA_ES_0);
+      } else {
+         si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, R_00B130_SPI_SHADER_USER_DATA_VS_0);
+      }
+   } else {
+      si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, 0);
+   }
+}
+
+static void si_emit_shader_pointer_head(struct radeon_cmdbuf *cs, unsigned sh_offset,
+                                        unsigned pointer_count)
+{
+   radeon_emit(cs, PKT3(PKT3_SET_SH_REG, pointer_count, 0));
+   radeon_emit(cs, (sh_offset - SI_SH_REG_OFFSET) >> 2);
+}
+
+static void si_emit_shader_pointer_body(struct si_screen *sscreen, struct radeon_cmdbuf *cs,
+                                        uint64_t va)
+{
+   radeon_emit(cs, va);
+
+   assert(va == 0 || (va >> 32) == sscreen->info.address32_hi);
+}
+
+static void si_emit_shader_pointer(struct si_context *sctx, struct si_descriptors *desc,
+                                   unsigned sh_base)
+{
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   unsigned sh_offset = sh_base + desc->shader_userdata_offset;
+
+   si_emit_shader_pointer_head(cs, sh_offset, 1);
+   si_emit_shader_pointer_body(sctx->screen, cs, desc->gpu_address);
+}
+
+static void si_emit_consecutive_shader_pointers(struct si_context *sctx, unsigned pointer_mask,
+                                                unsigned sh_base)
+{
+   if (!sh_base)
+      return;
+
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   unsigned mask = sctx->shader_pointers_dirty & pointer_mask;
+
+   while (mask) {
+      int start, count;
+      u_bit_scan_consecutive_range(&mask, &start, &count);
+
+      struct si_descriptors *descs = &sctx->descriptors[start];
+      unsigned sh_offset = sh_base + descs->shader_userdata_offset;
+
+      si_emit_shader_pointer_head(cs, sh_offset, count);
+      for (int i = 0; i < count; i++)
+         si_emit_shader_pointer_body(sctx->screen, cs, descs[i].gpu_address);
+   }
+}
+
+static void si_emit_global_shader_pointers(struct si_context *sctx, struct si_descriptors *descs)
+{
+   if (sctx->chip_class >= GFX10) {
+      si_emit_shader_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0);
+      /* HW VS stage only used in non-NGG mode. */
+      si_emit_shader_pointer(sctx, descs, R_00B130_SPI_SHADER_USER_DATA_VS_0);
+      si_emit_shader_pointer(sctx, descs, R_00B230_SPI_SHADER_USER_DATA_GS_0);
+      si_emit_shader_pointer(sctx, descs, R_00B430_SPI_SHADER_USER_DATA_HS_0);
+      return;
+   } else if (sctx->chip_class == GFX9) {
+      /* Broadcast it to all shader stages. */
+      si_emit_shader_pointer(sctx, descs, R_00B530_SPI_SHADER_USER_DATA_COMMON_0);
+      return;
+   }
+
+   si_emit_shader_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0);
+   si_emit_shader_pointer(sctx, descs, R_00B130_SPI_SHADER_USER_DATA_VS_0);
+   si_emit_shader_pointer(sctx, descs, R_00B330_SPI_SHADER_USER_DATA_ES_0);
+   si_emit_shader_pointer(sctx, descs, R_00B230_SPI_SHADER_USER_DATA_GS_0);
+   si_emit_shader_pointer(sctx, descs, R_00B430_SPI_SHADER_USER_DATA_HS_0);
+   si_emit_shader_pointer(sctx, descs, R_00B530_SPI_SHADER_USER_DATA_LS_0);
 }
 
 void si_emit_graphics_shader_pointers(struct si_context *sctx)
 {
-       uint32_t *sh_base = sctx->shader_pointers.sh_base;
-
-       if (sctx->shader_pointers_dirty & (1 << SI_DESCS_RW_BUFFERS)) {
-               si_emit_global_shader_pointers(sctx,
-                                              &sctx->descriptors[SI_DESCS_RW_BUFFERS]);
-       }
-
-       si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(VERTEX),
-                                           sh_base[PIPE_SHADER_VERTEX]);
-       si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_EVAL),
-                                           sh_base[PIPE_SHADER_TESS_EVAL]);
-       si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(FRAGMENT),
-                                           sh_base[PIPE_SHADER_FRAGMENT]);
-       si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_CTRL),
-                                           sh_base[PIPE_SHADER_TESS_CTRL]);
-       si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(GEOMETRY),
-                                           sh_base[PIPE_SHADER_GEOMETRY]);
-
-       sctx->shader_pointers_dirty &=
-               ~u_bit_consecutive(SI_DESCS_RW_BUFFERS, SI_DESCS_FIRST_COMPUTE);
-
-       if (sctx->vertex_buffer_pointer_dirty && sctx->num_vertex_elements) {
-               struct radeon_cmdbuf *cs = sctx->gfx_cs;
-
-               /* Find the location of the VB descriptor pointer. */
-               unsigned sh_dw_offset = SI_VS_NUM_USER_SGPR;
-               if (sctx->chip_class >= GFX9) {
-                       if (sctx->tes_shader.cso)
-                               sh_dw_offset = GFX9_TCS_NUM_USER_SGPR;
-                       else if (sctx->gs_shader.cso)
-                               sh_dw_offset = GFX9_VSGS_NUM_USER_SGPR;
-               }
-
-               unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + sh_dw_offset * 4;
-               si_emit_shader_pointer_head(cs, sh_offset, 1);
-               si_emit_shader_pointer_body(sctx->screen, cs,
-                                           sctx->vb_descriptors_buffer->gpu_address +
-                                           sctx->vb_descriptors_offset);
-               sctx->vertex_buffer_pointer_dirty = false;
-       }
-
-       if (sctx->vertex_buffer_user_sgprs_dirty &&
-           sctx->num_vertex_elements &&
-           sctx->screen->num_vbos_in_user_sgprs) {
-               struct radeon_cmdbuf *cs = sctx->gfx_cs;
-               unsigned num_desc = MIN2(sctx->num_vertex_elements,
-                                        sctx->screen->num_vbos_in_user_sgprs);
-               unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4;
-
-               si_emit_shader_pointer_head(cs, sh_offset, num_desc * 4);
-               radeon_emit_array(cs, sctx->vb_descriptor_user_sgprs, num_desc * 4);
-               sctx->vertex_buffer_user_sgprs_dirty = false;
-       }
-
-       if (sctx->graphics_bindless_pointer_dirty) {
-               si_emit_global_shader_pointers(sctx,
-                                              &sctx->bindless_descriptors);
-               sctx->graphics_bindless_pointer_dirty = false;
-       }
+   uint32_t *sh_base = sctx->shader_pointers.sh_base;
+
+   if (sctx->shader_pointers_dirty & (1 << SI_DESCS_RW_BUFFERS)) {
+      si_emit_global_shader_pointers(sctx, &sctx->descriptors[SI_DESCS_RW_BUFFERS]);
+   }
+
+   si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(VERTEX),
+                                       sh_base[PIPE_SHADER_VERTEX]);
+   si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_EVAL),
+                                       sh_base[PIPE_SHADER_TESS_EVAL]);
+   si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(FRAGMENT),
+                                       sh_base[PIPE_SHADER_FRAGMENT]);
+   si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_CTRL),
+                                       sh_base[PIPE_SHADER_TESS_CTRL]);
+   si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(GEOMETRY),
+                                       sh_base[PIPE_SHADER_GEOMETRY]);
+
+   sctx->shader_pointers_dirty &= ~u_bit_consecutive(SI_DESCS_RW_BUFFERS, SI_DESCS_FIRST_COMPUTE);
+
+   if (sctx->vertex_buffer_pointer_dirty && sctx->num_vertex_elements) {
+      struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+      /* Find the location of the VB descriptor pointer. */
+      unsigned sh_dw_offset = SI_VS_NUM_USER_SGPR;
+      if (sctx->chip_class >= GFX9) {
+         if (sctx->tes_shader.cso)
+            sh_dw_offset = GFX9_TCS_NUM_USER_SGPR;
+         else if (sctx->gs_shader.cso)
+            sh_dw_offset = GFX9_VSGS_NUM_USER_SGPR;
+      }
+
+      unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + sh_dw_offset * 4;
+      si_emit_shader_pointer_head(cs, sh_offset, 1);
+      si_emit_shader_pointer_body(
+         sctx->screen, cs, sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset);
+      sctx->vertex_buffer_pointer_dirty = false;
+   }
+
+   if (sctx->vertex_buffer_user_sgprs_dirty && sctx->num_vertex_elements &&
+       sctx->screen->num_vbos_in_user_sgprs) {
+      struct radeon_cmdbuf *cs = sctx->gfx_cs;
+      unsigned num_desc = MIN2(sctx->num_vertex_elements, sctx->screen->num_vbos_in_user_sgprs);
+      unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4;
+
+      si_emit_shader_pointer_head(cs, sh_offset, num_desc * 4);
+      radeon_emit_array(cs, sctx->vb_descriptor_user_sgprs, num_desc * 4);
+      sctx->vertex_buffer_user_sgprs_dirty = false;
+   }
+
+   if (sctx->graphics_bindless_pointer_dirty) {
+      si_emit_global_shader_pointers(sctx, &sctx->bindless_descriptors);
+      sctx->graphics_bindless_pointer_dirty = false;
+   }
 }
 
 void si_emit_compute_shader_pointers(struct si_context *sctx)
 {
-       unsigned base = R_00B900_COMPUTE_USER_DATA_0;
+   unsigned base = R_00B900_COMPUTE_USER_DATA_0;
 
-       si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(COMPUTE),
-                                           R_00B900_COMPUTE_USER_DATA_0);
-       sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(COMPUTE);
+   si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(COMPUTE),
+                                       R_00B900_COMPUTE_USER_DATA_0);
+   sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(COMPUTE);
 
-       if (sctx->compute_bindless_pointer_dirty) {
-               si_emit_shader_pointer(sctx, &sctx->bindless_descriptors, base);
-               sctx->compute_bindless_pointer_dirty = false;
-       }
+   if (sctx->compute_bindless_pointer_dirty) {
+      si_emit_shader_pointer(sctx, &sctx->bindless_descriptors, base);
+      sctx->compute_bindless_pointer_dirty = false;
+   }
 }
 
 /* BINDLESS */
 
-static void si_init_bindless_descriptors(struct si_context *sctx,
-                                        struct si_descriptors *desc,
-                                        short shader_userdata_rel_index,
-                                        unsigned num_elements)
+static void si_init_bindless_descriptors(struct si_context *sctx, struct si_descriptors *desc,
+                                         short shader_userdata_rel_index, unsigned num_elements)
 {
-       ASSERTED unsigned desc_slot;
+   ASSERTED unsigned desc_slot;
 
-       si_init_descriptors(desc, shader_userdata_rel_index, 16, num_elements);
-       sctx->bindless_descriptors.num_active_slots = num_elements;
+   si_init_descriptors(desc, shader_userdata_rel_index, 16, num_elements);
+   sctx->bindless_descriptors.num_active_slots = num_elements;
 
-       /* The first bindless descriptor is stored at slot 1, because 0 is not
-        * considered to be a valid handle.
-        */
-       sctx->num_bindless_descriptors = 1;
+   /* The first bindless descriptor is stored at slot 1, because 0 is not
+    * considered to be a valid handle.
+    */
+   sctx->num_bindless_descriptors = 1;
 
-       /* Track which bindless slots are used (or not). */
-       util_idalloc_init(&sctx->bindless_used_slots);
-       util_idalloc_resize(&sctx->bindless_used_slots, num_elements);
+   /* Track which bindless slots are used (or not). */
+   util_idalloc_init(&sctx->bindless_used_slots);
+   util_idalloc_resize(&sctx->bindless_used_slots, num_elements);
 
-       /* Reserve slot 0 because it's an invalid handle for bindless. */
-       desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots);
-       assert(desc_slot == 0);
+   /* Reserve slot 0 because it's an invalid handle for bindless. */
+   desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots);
+   assert(desc_slot == 0);
 }
 
 static void si_release_bindless_descriptors(struct si_context *sctx)
 {
-       si_release_descriptors(&sctx->bindless_descriptors);
-       util_idalloc_fini(&sctx->bindless_used_slots);
+   si_release_descriptors(&sctx->bindless_descriptors);
+   util_idalloc_fini(&sctx->bindless_used_slots);
 }
 
 static unsigned si_get_first_free_bindless_slot(struct si_context *sctx)
 {
-       struct si_descriptors *desc = &sctx->bindless_descriptors;
-       unsigned desc_slot;
+   struct si_descriptors *desc = &sctx->bindless_descriptors;
+   unsigned desc_slot;
 
-       desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots);
-       if (desc_slot >= desc->num_elements) {
-               /* The array of bindless descriptors is full, resize it. */
-               unsigned slot_size = desc->element_dw_size * 4;
-               unsigned new_num_elements = desc->num_elements * 2;
+   desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots);
+   if (desc_slot >= desc->num_elements) {
+      /* The array of bindless descriptors is full, resize it. */
+      unsigned slot_size = desc->element_dw_size * 4;
+      unsigned new_num_elements = desc->num_elements * 2;
 
-               desc->list = REALLOC(desc->list, desc->num_elements * slot_size,
-                                    new_num_elements * slot_size);
-               desc->num_elements = new_num_elements;
-               desc->num_active_slots = new_num_elements;
-       }
+      desc->list =
+         REALLOC(desc->list, desc->num_elements * slot_size, new_num_elements * slot_size);
+      desc->num_elements = new_num_elements;
+      desc->num_active_slots = new_num_elements;
+   }
 
-       assert(desc_slot);
-       return desc_slot;
+   assert(desc_slot);
+   return desc_slot;
 }
 
-static unsigned
-si_create_bindless_descriptor(struct si_context *sctx, uint32_t *desc_list,
-                             unsigned size)
+static unsigned si_create_bindless_descriptor(struct si_context *sctx, uint32_t *desc_list,
+                                              unsigned size)
 {
-       struct si_descriptors *desc = &sctx->bindless_descriptors;
-       unsigned desc_slot, desc_slot_offset;
+   struct si_descriptors *desc = &sctx->bindless_descriptors;
+   unsigned desc_slot, desc_slot_offset;
 
-       /* Find a free slot. */
-       desc_slot = si_get_first_free_bindless_slot(sctx);
+   /* Find a free slot. */
+   desc_slot = si_get_first_free_bindless_slot(sctx);
 
-       /* For simplicity, sampler and image bindless descriptors use fixed
-        * 16-dword slots for now. Image descriptors only need 8-dword but this
-        * doesn't really matter because no real apps use image handles.
-        */
-       desc_slot_offset = desc_slot * 16;
+   /* For simplicity, sampler and image bindless descriptors use fixed
+    * 16-dword slots for now. Image descriptors only need 8-dword but this
+    * doesn't really matter because no real apps use image handles.
+    */
+   desc_slot_offset = desc_slot * 16;
 
-       /* Copy the descriptor into the array. */
-       memcpy(desc->list + desc_slot_offset, desc_list, size);
+   /* Copy the descriptor into the array. */
+   memcpy(desc->list + desc_slot_offset, desc_list, size);
 
-       /* Re-upload the whole array of bindless descriptors into a new buffer.
-        */
-       if (!si_upload_descriptors(sctx, desc))
-               return 0;
+   /* Re-upload the whole array of bindless descriptors into a new buffer.
+    */
+   if (!si_upload_descriptors(sctx, desc))
+      return 0;
 
-       /* Make sure to re-emit the shader pointers for all stages. */
-       sctx->graphics_bindless_pointer_dirty = true;
-       sctx->compute_bindless_pointer_dirty = true;
+   /* Make sure to re-emit the shader pointers for all stages. */
+   sctx->graphics_bindless_pointer_dirty = true;
+   sctx->compute_bindless_pointer_dirty = true;
 
-       return desc_slot;
+   return desc_slot;
 }
 
-static void si_update_bindless_buffer_descriptor(struct si_context *sctx,
-                                                unsigned desc_slot,
-                                                struct pipe_resource *resource,
-                                                uint64_t offset,
-                                                bool *desc_dirty)
+static void si_update_bindless_buffer_descriptor(struct si_context *sctx, unsigned desc_slot,
+                                                 struct pipe_resource *resource, uint64_t offset,
+                                                 bool *desc_dirty)
 {
-       struct si_descriptors *desc = &sctx->bindless_descriptors;
-       struct si_resource *buf = si_resource(resource);
-       unsigned desc_slot_offset = desc_slot * 16;
-       uint32_t *desc_list = desc->list + desc_slot_offset + 4;
-       uint64_t old_desc_va;
+   struct si_descriptors *desc = &sctx->bindless_descriptors;
+   struct si_resource *buf = si_resource(resource);
+   unsigned desc_slot_offset = desc_slot * 16;
+   uint32_t *desc_list = desc->list + desc_slot_offset + 4;
+   uint64_t old_desc_va;
 
-       assert(resource->target == PIPE_BUFFER);
+   assert(resource->target == PIPE_BUFFER);
 
-       /* Retrieve the old buffer addr from the descriptor. */
-       old_desc_va = si_desc_extract_buffer_address(desc_list);
+   /* Retrieve the old buffer addr from the descriptor. */
+   old_desc_va = si_desc_extract_buffer_address(desc_list);
 
-       if (old_desc_va != buf->gpu_address + offset) {
-               /* The buffer has been invalidated when the handle wasn't
-                * resident, update the descriptor and the dirty flag.
-                */
-               si_set_buf_desc_address(buf, offset, &desc_list[0]);
+   if (old_desc_va != buf->gpu_address + offset) {
+      /* The buffer has been invalidated when the handle wasn't
+       * resident, update the descriptor and the dirty flag.
+       */
+      si_set_buf_desc_address(buf, offset, &desc_list[0]);
 
-               *desc_dirty = true;
-       }
+      *desc_dirty = true;
+   }
 }
 
-static uint64_t si_create_texture_handle(struct pipe_context *ctx,
-                                        struct pipe_sampler_view *view,
-                                        const struct pipe_sampler_state *state)
+static uint64_t si_create_texture_handle(struct pipe_context *ctx, struct pipe_sampler_view *view,
+                                         const struct pipe_sampler_state *state)
 {
-       struct si_sampler_view *sview = (struct si_sampler_view *)view;
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_texture_handle *tex_handle;
-       struct si_sampler_state *sstate;
-       uint32_t desc_list[16];
-       uint64_t handle;
+   struct si_sampler_view *sview = (struct si_sampler_view *)view;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_texture_handle *tex_handle;
+   struct si_sampler_state *sstate;
+   uint32_t desc_list[16];
+   uint64_t handle;
 
-       tex_handle = CALLOC_STRUCT(si_texture_handle);
-       if (!tex_handle)
-               return 0;
+   tex_handle = CALLOC_STRUCT(si_texture_handle);
+   if (!tex_handle)
+      return 0;
 
-       memset(desc_list, 0, sizeof(desc_list));
-       si_init_descriptor_list(&desc_list[0], 16, 1, null_texture_descriptor);
+   memset(desc_list, 0, sizeof(desc_list));
+   si_init_descriptor_list(&desc_list[0], 16, 1, null_texture_descriptor);
 
-       sstate = ctx->create_sampler_state(ctx, state);
-       if (!sstate) {
-               FREE(tex_handle);
-               return 0;
-       }
+   sstate = ctx->create_sampler_state(ctx, state);
+   if (!sstate) {
+      FREE(tex_handle);
+      return 0;
+   }
 
-       si_set_sampler_view_desc(sctx, sview, sstate, &desc_list[0]);
-       memcpy(&tex_handle->sstate, sstate, sizeof(*sstate));
-       ctx->delete_sampler_state(ctx, sstate);
+   si_set_sampler_view_desc(sctx, sview, sstate, &desc_list[0]);
+   memcpy(&tex_handle->sstate, sstate, sizeof(*sstate));
+   ctx->delete_sampler_state(ctx, sstate);
 
-       tex_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list,
-                                                             sizeof(desc_list));
-       if (!tex_handle->desc_slot) {
-               FREE(tex_handle);
-               return 0;
-       }
+   tex_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list, sizeof(desc_list));
+   if (!tex_handle->desc_slot) {
+      FREE(tex_handle);
+      return 0;
+   }
 
-       handle = tex_handle->desc_slot;
+   handle = tex_handle->desc_slot;
 
-       if (!_mesa_hash_table_insert(sctx->tex_handles,
-                                    (void *)(uintptr_t)handle,
-                                    tex_handle)) {
-               FREE(tex_handle);
-               return 0;
-       }
+   if (!_mesa_hash_table_insert(sctx->tex_handles, (void *)(uintptr_t)handle, tex_handle)) {
+      FREE(tex_handle);
+      return 0;
+   }
 
-       pipe_sampler_view_reference(&tex_handle->view, view);
+   pipe_sampler_view_reference(&tex_handle->view, view);
 
-       si_resource(sview->base.texture)->texture_handle_allocated = true;
+   si_resource(sview->base.texture)->texture_handle_allocated = true;
 
-       return handle;
+   return handle;
 }
 
 static void si_delete_texture_handle(struct pipe_context *ctx, uint64_t handle)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_texture_handle *tex_handle;
-       struct hash_entry *entry;
-
-       entry = _mesa_hash_table_search(sctx->tex_handles,
-                                       (void *)(uintptr_t)handle);
-       if (!entry)
-               return;
-
-       tex_handle = (struct si_texture_handle *)entry->data;
-
-       /* Allow this descriptor slot to be re-used. */
-       util_idalloc_free(&sctx->bindless_used_slots, tex_handle->desc_slot);
-
-       pipe_sampler_view_reference(&tex_handle->view, NULL);
-       _mesa_hash_table_remove(sctx->tex_handles, entry);
-       FREE(tex_handle);
-}
-
-static void si_make_texture_handle_resident(struct pipe_context *ctx,
-                                           uint64_t handle, bool resident)
-{
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_texture_handle *tex_handle;
-       struct si_sampler_view *sview;
-       struct hash_entry *entry;
-
-       entry = _mesa_hash_table_search(sctx->tex_handles,
-                                       (void *)(uintptr_t)handle);
-       if (!entry)
-               return;
-
-       tex_handle = (struct si_texture_handle *)entry->data;
-       sview = (struct si_sampler_view *)tex_handle->view;
-
-       if (resident) {
-               if (sview->base.texture->target != PIPE_BUFFER) {
-                       struct si_texture *tex =
-                               (struct si_texture *)sview->base.texture;
-
-                       if (depth_needs_decompression(tex)) {
-                               util_dynarray_append(
-                                       &sctx->resident_tex_needs_depth_decompress,
-                                       struct si_texture_handle *,
-                                       tex_handle);
-                       }
-
-                       if (color_needs_decompression(tex)) {
-                               util_dynarray_append(
-                                       &sctx->resident_tex_needs_color_decompress,
-                                       struct si_texture_handle *,
-                                       tex_handle);
-                       }
-
-                       if (tex->surface.dcc_offset &&
-                           p_atomic_read(&tex->framebuffers_bound))
-                               sctx->need_check_render_feedback = true;
-
-                       si_update_bindless_texture_descriptor(sctx, tex_handle);
-               } else {
-                       si_update_bindless_buffer_descriptor(sctx,
-                                                            tex_handle->desc_slot,
-                                                            sview->base.texture,
-                                                            sview->base.u.buf.offset,
-                                                            &tex_handle->desc_dirty);
-               }
-
-               /* Re-upload the descriptor if it has been updated while it
-                * wasn't resident.
-                */
-               if (tex_handle->desc_dirty)
-                       sctx->bindless_descriptors_dirty = true;
-
-               /* Add the texture handle to the per-context list. */
-               util_dynarray_append(&sctx->resident_tex_handles,
-                                    struct si_texture_handle *, tex_handle);
-
-               /* Add the buffers to the current CS in case si_begin_new_cs()
-                * is not going to be called.
-                */
-               si_sampler_view_add_buffer(sctx, sview->base.texture,
-                                          RADEON_USAGE_READ,
-                                          sview->is_stencil_sampler, false);
-       } else {
-               /* Remove the texture handle from the per-context list. */
-               util_dynarray_delete_unordered(&sctx->resident_tex_handles,
-                                              struct si_texture_handle *,
-                                              tex_handle);
-
-               if (sview->base.texture->target != PIPE_BUFFER) {
-                       util_dynarray_delete_unordered(
-                               &sctx->resident_tex_needs_depth_decompress,
-                               struct si_texture_handle *, tex_handle);
-
-                       util_dynarray_delete_unordered(
-                               &sctx->resident_tex_needs_color_decompress,
-                               struct si_texture_handle *, tex_handle);
-               }
-       }
-}
-
-static uint64_t si_create_image_handle(struct pipe_context *ctx,
-                                      const struct pipe_image_view *view)
-{
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_image_handle *img_handle;
-       uint32_t desc_list[16];
-       uint64_t handle;
-
-       if (!view || !view->resource)
-               return 0;
-
-       img_handle = CALLOC_STRUCT(si_image_handle);
-       if (!img_handle)
-               return 0;
-
-       memset(desc_list, 0, sizeof(desc_list));
-       si_init_descriptor_list(&desc_list[0], 8, 2, null_image_descriptor);
-
-       si_set_shader_image_desc(sctx, view, false, &desc_list[0], &desc_list[8]);
-
-       img_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list,
-                                                             sizeof(desc_list));
-       if (!img_handle->desc_slot) {
-               FREE(img_handle);
-               return 0;
-       }
-
-       handle = img_handle->desc_slot;
-
-       if (!_mesa_hash_table_insert(sctx->img_handles,
-                                    (void *)(uintptr_t)handle,
-                                    img_handle)) {
-               FREE(img_handle);
-               return 0;
-       }
-
-       util_copy_image_view(&img_handle->view, view);
-
-       si_resource(view->resource)->image_handle_allocated = true;
-
-       return handle;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_texture_handle *tex_handle;
+   struct hash_entry *entry;
+
+   entry = _mesa_hash_table_search(sctx->tex_handles, (void *)(uintptr_t)handle);
+   if (!entry)
+      return;
+
+   tex_handle = (struct si_texture_handle *)entry->data;
+
+   /* Allow this descriptor slot to be re-used. */
+   util_idalloc_free(&sctx->bindless_used_slots, tex_handle->desc_slot);
+
+   pipe_sampler_view_reference(&tex_handle->view, NULL);
+   _mesa_hash_table_remove(sctx->tex_handles, entry);
+   FREE(tex_handle);
+}
+
+static void si_make_texture_handle_resident(struct pipe_context *ctx, uint64_t handle,
+                                            bool resident)
+{
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_texture_handle *tex_handle;
+   struct si_sampler_view *sview;
+   struct hash_entry *entry;
+
+   entry = _mesa_hash_table_search(sctx->tex_handles, (void *)(uintptr_t)handle);
+   if (!entry)
+      return;
+
+   tex_handle = (struct si_texture_handle *)entry->data;
+   sview = (struct si_sampler_view *)tex_handle->view;
+
+   if (resident) {
+      if (sview->base.texture->target != PIPE_BUFFER) {
+         struct si_texture *tex = (struct si_texture *)sview->base.texture;
+
+         if (depth_needs_decompression(tex)) {
+            util_dynarray_append(&sctx->resident_tex_needs_depth_decompress,
+                                 struct si_texture_handle *, tex_handle);
+         }
+
+         if (color_needs_decompression(tex)) {
+            util_dynarray_append(&sctx->resident_tex_needs_color_decompress,
+                                 struct si_texture_handle *, tex_handle);
+         }
+
+         if (tex->surface.dcc_offset && p_atomic_read(&tex->framebuffers_bound))
+            sctx->need_check_render_feedback = true;
+
+         si_update_bindless_texture_descriptor(sctx, tex_handle);
+      } else {
+         si_update_bindless_buffer_descriptor(sctx, tex_handle->desc_slot, sview->base.texture,
+                                              sview->base.u.buf.offset, &tex_handle->desc_dirty);
+      }
+
+      /* Re-upload the descriptor if it has been updated while it
+       * wasn't resident.
+       */
+      if (tex_handle->desc_dirty)
+         sctx->bindless_descriptors_dirty = true;
+
+      /* Add the texture handle to the per-context list. */
+      util_dynarray_append(&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle);
+
+      /* Add the buffers to the current CS in case si_begin_new_cs()
+       * is not going to be called.
+       */
+      si_sampler_view_add_buffer(sctx, sview->base.texture, RADEON_USAGE_READ,
+                                 sview->is_stencil_sampler, false);
+   } else {
+      /* Remove the texture handle from the per-context list. */
+      util_dynarray_delete_unordered(&sctx->resident_tex_handles, struct si_texture_handle *,
+                                     tex_handle);
+
+      if (sview->base.texture->target != PIPE_BUFFER) {
+         util_dynarray_delete_unordered(&sctx->resident_tex_needs_depth_decompress,
+                                        struct si_texture_handle *, tex_handle);
+
+         util_dynarray_delete_unordered(&sctx->resident_tex_needs_color_decompress,
+                                        struct si_texture_handle *, tex_handle);
+      }
+   }
+}
+
+static uint64_t si_create_image_handle(struct pipe_context *ctx, const struct pipe_image_view *view)
+{
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_image_handle *img_handle;
+   uint32_t desc_list[16];
+   uint64_t handle;
+
+   if (!view || !view->resource)
+      return 0;
+
+   img_handle = CALLOC_STRUCT(si_image_handle);
+   if (!img_handle)
+      return 0;
+
+   memset(desc_list, 0, sizeof(desc_list));
+   si_init_descriptor_list(&desc_list[0], 8, 2, null_image_descriptor);
+
+   si_set_shader_image_desc(sctx, view, false, &desc_list[0], &desc_list[8]);
+
+   img_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list, sizeof(desc_list));
+   if (!img_handle->desc_slot) {
+      FREE(img_handle);
+      return 0;
+   }
+
+   handle = img_handle->desc_slot;
+
+   if (!_mesa_hash_table_insert(sctx->img_handles, (void *)(uintptr_t)handle, img_handle)) {
+      FREE(img_handle);
+      return 0;
+   }
+
+   util_copy_image_view(&img_handle->view, view);
+
+   si_resource(view->resource)->image_handle_allocated = true;
+
+   return handle;
 }
 
 static void si_delete_image_handle(struct pipe_context *ctx, uint64_t handle)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_image_handle *img_handle;
-       struct hash_entry *entry;
-
-       entry = _mesa_hash_table_search(sctx->img_handles,
-                                       (void *)(uintptr_t)handle);
-       if (!entry)
-               return;
-
-       img_handle = (struct si_image_handle *)entry->data;
-
-       util_copy_image_view(&img_handle->view, NULL);
-       _mesa_hash_table_remove(sctx->img_handles, entry);
-       FREE(img_handle);
-}
-
-static void si_make_image_handle_resident(struct pipe_context *ctx,
-                                         uint64_t handle, unsigned access,
-                                         bool resident)
-{
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_image_handle *img_handle;
-       struct pipe_image_view *view;
-       struct si_resource *res;
-       struct hash_entry *entry;
-
-       entry = _mesa_hash_table_search(sctx->img_handles,
-                                       (void *)(uintptr_t)handle);
-       if (!entry)
-               return;
-
-       img_handle = (struct si_image_handle *)entry->data;
-       view = &img_handle->view;
-       res = si_resource(view->resource);
-
-       if (resident) {
-               if (res->b.b.target != PIPE_BUFFER) {
-                       struct si_texture *tex = (struct si_texture *)res;
-                       unsigned level = view->u.tex.level;
-
-                       if (color_needs_decompression(tex)) {
-                               util_dynarray_append(
-                                       &sctx->resident_img_needs_color_decompress,
-                                       struct si_image_handle *,
-                                       img_handle);
-                       }
-
-                       if (vi_dcc_enabled(tex, level) &&
-                           p_atomic_read(&tex->framebuffers_bound))
-                               sctx->need_check_render_feedback = true;
-
-                       si_update_bindless_image_descriptor(sctx, img_handle);
-               } else {
-                       si_update_bindless_buffer_descriptor(sctx,
-                                                            img_handle->desc_slot,
-                                                            view->resource,
-                                                            view->u.buf.offset,
-                                                            &img_handle->desc_dirty);
-               }
-
-               /* Re-upload the descriptor if it has been updated while it
-                * wasn't resident.
-                */
-               if (img_handle->desc_dirty)
-                       sctx->bindless_descriptors_dirty = true;
-
-               /* Add the image handle to the per-context list. */
-               util_dynarray_append(&sctx->resident_img_handles,
-                                    struct si_image_handle *, img_handle);
-
-               /* Add the buffers to the current CS in case si_begin_new_cs()
-                * is not going to be called.
-                */
-               si_sampler_view_add_buffer(sctx, view->resource,
-                                          (access & PIPE_IMAGE_ACCESS_WRITE) ?
-                                          RADEON_USAGE_READWRITE :
-                                          RADEON_USAGE_READ, false, false);
-       } else {
-               /* Remove the image handle from the per-context list. */
-               util_dynarray_delete_unordered(&sctx->resident_img_handles,
-                                              struct si_image_handle *,
-                                              img_handle);
-
-               if (res->b.b.target != PIPE_BUFFER) {
-                       util_dynarray_delete_unordered(
-                               &sctx->resident_img_needs_color_decompress,
-                               struct si_image_handle *,
-                               img_handle);
-               }
-       }
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_image_handle *img_handle;
+   struct hash_entry *entry;
+
+   entry = _mesa_hash_table_search(sctx->img_handles, (void *)(uintptr_t)handle);
+   if (!entry)
+      return;
+
+   img_handle = (struct si_image_handle *)entry->data;
+
+   util_copy_image_view(&img_handle->view, NULL);
+   _mesa_hash_table_remove(sctx->img_handles, entry);
+   FREE(img_handle);
+}
+
+static void si_make_image_handle_resident(struct pipe_context *ctx, uint64_t handle,
+                                          unsigned access, bool resident)
+{
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_image_handle *img_handle;
+   struct pipe_image_view *view;
+   struct si_resource *res;
+   struct hash_entry *entry;
+
+   entry = _mesa_hash_table_search(sctx->img_handles, (void *)(uintptr_t)handle);
+   if (!entry)
+      return;
+
+   img_handle = (struct si_image_handle *)entry->data;
+   view = &img_handle->view;
+   res = si_resource(view->resource);
+
+   if (resident) {
+      if (res->b.b.target != PIPE_BUFFER) {
+         struct si_texture *tex = (struct si_texture *)res;
+         unsigned level = view->u.tex.level;
+
+         if (color_needs_decompression(tex)) {
+            util_dynarray_append(&sctx->resident_img_needs_color_decompress,
+                                 struct si_image_handle *, img_handle);
+         }
+
+         if (vi_dcc_enabled(tex, level) && p_atomic_read(&tex->framebuffers_bound))
+            sctx->need_check_render_feedback = true;
+
+         si_update_bindless_image_descriptor(sctx, img_handle);
+      } else {
+         si_update_bindless_buffer_descriptor(sctx, img_handle->desc_slot, view->resource,
+                                              view->u.buf.offset, &img_handle->desc_dirty);
+      }
+
+      /* Re-upload the descriptor if it has been updated while it
+       * wasn't resident.
+       */
+      if (img_handle->desc_dirty)
+         sctx->bindless_descriptors_dirty = true;
+
+      /* Add the image handle to the per-context list. */
+      util_dynarray_append(&sctx->resident_img_handles, struct si_image_handle *, img_handle);
+
+      /* Add the buffers to the current CS in case si_begin_new_cs()
+       * is not going to be called.
+       */
+      si_sampler_view_add_buffer(
+         sctx, view->resource,
+         (access & PIPE_IMAGE_ACCESS_WRITE) ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ, false,
+         false);
+   } else {
+      /* Remove the image handle from the per-context list. */
+      util_dynarray_delete_unordered(&sctx->resident_img_handles, struct si_image_handle *,
+                                     img_handle);
+
+      if (res->b.b.target != PIPE_BUFFER) {
+         util_dynarray_delete_unordered(&sctx->resident_img_needs_color_decompress,
+                                        struct si_image_handle *, img_handle);
+      }
+   }
 }
 
 static void si_resident_buffers_add_all_to_bo_list(struct si_context *sctx)
 {
-       unsigned num_resident_tex_handles, num_resident_img_handles;
+   unsigned num_resident_tex_handles, num_resident_img_handles;
 
-       num_resident_tex_handles = sctx->resident_tex_handles.size /
-                                  sizeof(struct si_texture_handle *);
-       num_resident_img_handles = sctx->resident_img_handles.size /
-                                  sizeof(struct si_image_handle *);
+   num_resident_tex_handles = sctx->resident_tex_handles.size / sizeof(struct si_texture_handle *);
+   num_resident_img_handles = sctx->resident_img_handles.size / sizeof(struct si_image_handle *);
 
-       /* Add all resident texture handles. */
-       util_dynarray_foreach(&sctx->resident_tex_handles,
-                             struct si_texture_handle *, tex_handle) {
-               struct si_sampler_view *sview =
-                       (struct si_sampler_view *)(*tex_handle)->view;
+   /* Add all resident texture handles. */
+   util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) {
+      struct si_sampler_view *sview = (struct si_sampler_view *)(*tex_handle)->view;
 
-               si_sampler_view_add_buffer(sctx, sview->base.texture,
-                                          RADEON_USAGE_READ,
-                                          sview->is_stencil_sampler, false);
-       }
+      si_sampler_view_add_buffer(sctx, sview->base.texture, RADEON_USAGE_READ,
+                                 sview->is_stencil_sampler, false);
+   }
 
-       /* Add all resident image handles. */
-       util_dynarray_foreach(&sctx->resident_img_handles,
-                             struct si_image_handle *, img_handle) {
-               struct pipe_image_view *view = &(*img_handle)->view;
+   /* Add all resident image handles. */
+   util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) {
+      struct pipe_image_view *view = &(*img_handle)->view;
 
-               si_sampler_view_add_buffer(sctx, view->resource,
-                                          RADEON_USAGE_READWRITE,
-                                          false, false);
-       }
+      si_sampler_view_add_buffer(sctx, view->resource, RADEON_USAGE_READWRITE, false, false);
+   }
 
-       sctx->num_resident_handles += num_resident_tex_handles +
-                                       num_resident_img_handles;
-       assert(sctx->bo_list_add_all_resident_resources);
-       sctx->bo_list_add_all_resident_resources = false;
+   sctx->num_resident_handles += num_resident_tex_handles + num_resident_img_handles;
+   assert(sctx->bo_list_add_all_resident_resources);
+   sctx->bo_list_add_all_resident_resources = false;
 }
 
 /* INIT/DEINIT/UPLOAD */
 
 void si_init_all_descriptors(struct si_context *sctx)
 {
-       int i;
-       unsigned first_shader =
-               sctx->has_graphics ? 0 : PIPE_SHADER_COMPUTE;
-
-       for (i = first_shader; i < SI_NUM_SHADERS; i++) {
-               bool is_2nd = sctx->chip_class >= GFX9 &&
-                                    (i == PIPE_SHADER_TESS_CTRL ||
-                                     i == PIPE_SHADER_GEOMETRY);
-               unsigned num_sampler_slots = SI_NUM_IMAGE_SLOTS / 2 + SI_NUM_SAMPLERS;
-               unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS;
-               int rel_dw_offset;
-               struct si_descriptors *desc;
-
-               if (is_2nd) {
-                       if (i == PIPE_SHADER_TESS_CTRL) {
-                               rel_dw_offset = (R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS -
-                                                R_00B430_SPI_SHADER_USER_DATA_LS_0) / 4;
-                       } else if (sctx->chip_class >= GFX10) { /* PIPE_SHADER_GEOMETRY */
-                               rel_dw_offset = (R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS -
-                                                R_00B230_SPI_SHADER_USER_DATA_GS_0) / 4;
-                       } else {
-                               rel_dw_offset = (R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS -
-                                                R_00B330_SPI_SHADER_USER_DATA_ES_0) / 4;
-                       }
-               } else {
-                       rel_dw_offset = SI_SGPR_CONST_AND_SHADER_BUFFERS;
-               }
-               desc = si_const_and_shader_buffer_descriptors(sctx, i);
-               si_init_buffer_resources(&sctx->const_and_shader_buffers[i], desc,
-                                        num_buffer_slots, rel_dw_offset,
-                                        RADEON_PRIO_SHADER_RW_BUFFER,
-                                        RADEON_PRIO_CONST_BUFFER);
-               desc->slot_index_to_bind_directly = si_get_constbuf_slot(0);
-
-               if (is_2nd) {
-                       if (i == PIPE_SHADER_TESS_CTRL) {
-                               rel_dw_offset = (R_00B40C_SPI_SHADER_USER_DATA_ADDR_HI_HS -
-                                                R_00B430_SPI_SHADER_USER_DATA_LS_0) / 4;
-                       } else if (sctx->chip_class >= GFX10) { /* PIPE_SHADER_GEOMETRY */
-                               rel_dw_offset = (R_00B20C_SPI_SHADER_USER_DATA_ADDR_HI_GS -
-                                                R_00B230_SPI_SHADER_USER_DATA_GS_0) / 4;
-                       } else {
-                               rel_dw_offset = (R_00B20C_SPI_SHADER_USER_DATA_ADDR_HI_GS -
-                                                R_00B330_SPI_SHADER_USER_DATA_ES_0) / 4;
-                       }
-               } else {
-                       rel_dw_offset = SI_SGPR_SAMPLERS_AND_IMAGES;
-               }
-
-               desc = si_sampler_and_image_descriptors(sctx, i);
-               si_init_descriptors(desc, rel_dw_offset, 16, num_sampler_slots);
-
-               int j;
-               for (j = 0; j < SI_NUM_IMAGE_SLOTS; j++)
-                       memcpy(desc->list + j * 8, null_image_descriptor, 8 * 4);
-               for (; j < SI_NUM_IMAGE_SLOTS + SI_NUM_SAMPLERS * 2; j++)
-                       memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4);
-       }
-
-       si_init_buffer_resources(&sctx->rw_buffers,
-                                &sctx->descriptors[SI_DESCS_RW_BUFFERS],
-                                SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
-                                /* The second priority is used by
-                                 * const buffers in RW buffer slots. */
-                                RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER);
-       sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS;
-
-       /* Initialize an array of 1024 bindless descriptors, when the limit is
-        * reached, just make it larger and re-upload the whole array.
-        */
-       si_init_bindless_descriptors(sctx, &sctx->bindless_descriptors,
-                                    SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES,
-                                    1024);
-
-       sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
-
-       /* Set pipe_context functions. */
-       sctx->b.bind_sampler_states = si_bind_sampler_states;
-       sctx->b.set_shader_images = si_set_shader_images;
-       sctx->b.set_constant_buffer = si_pipe_set_constant_buffer;
-       sctx->b.set_shader_buffers = si_set_shader_buffers;
-       sctx->b.set_sampler_views = si_set_sampler_views;
-       sctx->b.create_texture_handle = si_create_texture_handle;
-       sctx->b.delete_texture_handle = si_delete_texture_handle;
-       sctx->b.make_texture_handle_resident = si_make_texture_handle_resident;
-       sctx->b.create_image_handle = si_create_image_handle;
-       sctx->b.delete_image_handle = si_delete_image_handle;
-       sctx->b.make_image_handle_resident = si_make_image_handle_resident;
-
-       if (!sctx->has_graphics)
-               return;
-
-       sctx->b.set_polygon_stipple = si_set_polygon_stipple;
-
-       /* Shader user data. */
-       sctx->atoms.s.shader_pointers.emit = si_emit_graphics_shader_pointers;
-
-       /* Set default and immutable mappings. */
-       if (sctx->ngg) {
-               assert(sctx->chip_class >= GFX10);
-               si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B230_SPI_SHADER_USER_DATA_GS_0);
-       } else {
-               si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0);
-       }
-
-       if (sctx->chip_class == GFX9) {
-               si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL,
-                                     R_00B430_SPI_SHADER_USER_DATA_LS_0);
-               si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY,
-                                     R_00B330_SPI_SHADER_USER_DATA_ES_0);
-       } else {
-               si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL,
-                                     R_00B430_SPI_SHADER_USER_DATA_HS_0);
-               si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY,
-                                     R_00B230_SPI_SHADER_USER_DATA_GS_0);
-       }
-       si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0);
+   int i;
+   unsigned first_shader = sctx->has_graphics ? 0 : PIPE_SHADER_COMPUTE;
+
+   for (i = first_shader; i < SI_NUM_SHADERS; i++) {
+      bool is_2nd =
+         sctx->chip_class >= GFX9 && (i == PIPE_SHADER_TESS_CTRL || i == PIPE_SHADER_GEOMETRY);
+      unsigned num_sampler_slots = SI_NUM_IMAGE_SLOTS / 2 + SI_NUM_SAMPLERS;
+      unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS;
+      int rel_dw_offset;
+      struct si_descriptors *desc;
+
+      if (is_2nd) {
+         if (i == PIPE_SHADER_TESS_CTRL) {
+            rel_dw_offset =
+               (R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS - R_00B430_SPI_SHADER_USER_DATA_LS_0) / 4;
+         } else if (sctx->chip_class >= GFX10) { /* PIPE_SHADER_GEOMETRY */
+            rel_dw_offset =
+               (R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS - R_00B230_SPI_SHADER_USER_DATA_GS_0) / 4;
+         } else {
+            rel_dw_offset =
+               (R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS - R_00B330_SPI_SHADER_USER_DATA_ES_0) / 4;
+         }
+      } else {
+         rel_dw_offset = SI_SGPR_CONST_AND_SHADER_BUFFERS;
+      }
+      desc = si_const_and_shader_buffer_descriptors(sctx, i);
+      si_init_buffer_resources(&sctx->const_and_shader_buffers[i], desc, num_buffer_slots,
+                               rel_dw_offset, RADEON_PRIO_SHADER_RW_BUFFER,
+                               RADEON_PRIO_CONST_BUFFER);
+      desc->slot_index_to_bind_directly = si_get_constbuf_slot(0);
+
+      if (is_2nd) {
+         if (i == PIPE_SHADER_TESS_CTRL) {
+            rel_dw_offset =
+               (R_00B40C_SPI_SHADER_USER_DATA_ADDR_HI_HS - R_00B430_SPI_SHADER_USER_DATA_LS_0) / 4;
+         } else if (sctx->chip_class >= GFX10) { /* PIPE_SHADER_GEOMETRY */
+            rel_dw_offset =
+               (R_00B20C_SPI_SHADER_USER_DATA_ADDR_HI_GS - R_00B230_SPI_SHADER_USER_DATA_GS_0) / 4;
+         } else {
+            rel_dw_offset =
+               (R_00B20C_SPI_SHADER_USER_DATA_ADDR_HI_GS - R_00B330_SPI_SHADER_USER_DATA_ES_0) / 4;
+         }
+      } else {
+         rel_dw_offset = SI_SGPR_SAMPLERS_AND_IMAGES;
+      }
+
+      desc = si_sampler_and_image_descriptors(sctx, i);
+      si_init_descriptors(desc, rel_dw_offset, 16, num_sampler_slots);
+
+      int j;
+      for (j = 0; j < SI_NUM_IMAGE_SLOTS; j++)
+         memcpy(desc->list + j * 8, null_image_descriptor, 8 * 4);
+      for (; j < SI_NUM_IMAGE_SLOTS + SI_NUM_SAMPLERS * 2; j++)
+         memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4);
+   }
+
+   si_init_buffer_resources(&sctx->rw_buffers, &sctx->descriptors[SI_DESCS_RW_BUFFERS],
+                            SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
+                            /* The second priority is used by
+                             * const buffers in RW buffer slots. */
+                            RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER);
+   sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS;
+
+   /* Initialize an array of 1024 bindless descriptors, when the limit is
+    * reached, just make it larger and re-upload the whole array.
+    */
+   si_init_bindless_descriptors(sctx, &sctx->bindless_descriptors,
+                                SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES, 1024);
+
+   sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
+
+   /* Set pipe_context functions. */
+   sctx->b.bind_sampler_states = si_bind_sampler_states;
+   sctx->b.set_shader_images = si_set_shader_images;
+   sctx->b.set_constant_buffer = si_pipe_set_constant_buffer;
+   sctx->b.set_shader_buffers = si_set_shader_buffers;
+   sctx->b.set_sampler_views = si_set_sampler_views;
+   sctx->b.create_texture_handle = si_create_texture_handle;
+   sctx->b.delete_texture_handle = si_delete_texture_handle;
+   sctx->b.make_texture_handle_resident = si_make_texture_handle_resident;
+   sctx->b.create_image_handle = si_create_image_handle;
+   sctx->b.delete_image_handle = si_delete_image_handle;
+   sctx->b.make_image_handle_resident = si_make_image_handle_resident;
+
+   if (!sctx->has_graphics)
+      return;
+
+   sctx->b.set_polygon_stipple = si_set_polygon_stipple;
+
+   /* Shader user data. */
+   sctx->atoms.s.shader_pointers.emit = si_emit_graphics_shader_pointers;
+
+   /* Set default and immutable mappings. */
+   if (sctx->ngg) {
+      assert(sctx->chip_class >= GFX10);
+      si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B230_SPI_SHADER_USER_DATA_GS_0);
+   } else {
+      si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0);
+   }
+
+   if (sctx->chip_class == GFX9) {
+      si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, R_00B430_SPI_SHADER_USER_DATA_LS_0);
+      si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, R_00B330_SPI_SHADER_USER_DATA_ES_0);
+   } else {
+      si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, R_00B430_SPI_SHADER_USER_DATA_HS_0);
+      si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, R_00B230_SPI_SHADER_USER_DATA_GS_0);
+   }
+   si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0);
 }
 
 static bool si_upload_shader_descriptors(struct si_context *sctx, unsigned mask)
 {
-       unsigned dirty = sctx->descriptors_dirty & mask;
+   unsigned dirty = sctx->descriptors_dirty & mask;
 
-       /* Assume nothing will go wrong: */
-       sctx->shader_pointers_dirty |= dirty;
+   /* Assume nothing will go wrong: */
+   sctx->shader_pointers_dirty |= dirty;
 
-       while (dirty) {
-               unsigned i = u_bit_scan(&dirty);
+   while (dirty) {
+      unsigned i = u_bit_scan(&dirty);
 
-               if (!si_upload_descriptors(sctx, &sctx->descriptors[i]))
-                       return false;
-       }
+      if (!si_upload_descriptors(sctx, &sctx->descriptors[i]))
+         return false;
+   }
 
-       sctx->descriptors_dirty &= ~mask;
+   sctx->descriptors_dirty &= ~mask;
 
-       si_upload_bindless_descriptors(sctx);
+   si_upload_bindless_descriptors(sctx);
 
-       return true;
+   return true;
 }
 
 bool si_upload_graphics_shader_descriptors(struct si_context *sctx)
 {
-       const unsigned mask = u_bit_consecutive(0, SI_DESCS_FIRST_COMPUTE);
-       return si_upload_shader_descriptors(sctx, mask);
+   const unsigned mask = u_bit_consecutive(0, SI_DESCS_FIRST_COMPUTE);
+   return si_upload_shader_descriptors(sctx, mask);
 }
 
 bool si_upload_compute_shader_descriptors(struct si_context *sctx)
 {
-       /* Does not update rw_buffers as that is not needed for compute shaders
-        * and the input buffer is using the same SGPR's anyway.
-        */
-       const unsigned mask = u_bit_consecutive(SI_DESCS_FIRST_COMPUTE,
-                                               SI_NUM_DESCS - SI_DESCS_FIRST_COMPUTE);
-       return si_upload_shader_descriptors(sctx, mask);
+   /* Does not update rw_buffers as that is not needed for compute shaders
+    * and the input buffer is using the same SGPR's anyway.
+    */
+   const unsigned mask =
+      u_bit_consecutive(SI_DESCS_FIRST_COMPUTE, SI_NUM_DESCS - SI_DESCS_FIRST_COMPUTE);
+   return si_upload_shader_descriptors(sctx, mask);
 }
 
 void si_release_all_descriptors(struct si_context *sctx)
 {
-       int i;
+   int i;
 
-       for (i = 0; i < SI_NUM_SHADERS; i++) {
-               si_release_buffer_resources(&sctx->const_and_shader_buffers[i],
-                                           si_const_and_shader_buffer_descriptors(sctx, i));
-               si_release_sampler_views(&sctx->samplers[i]);
-               si_release_image_views(&sctx->images[i]);
-       }
-       si_release_buffer_resources(&sctx->rw_buffers,
-                                   &sctx->descriptors[SI_DESCS_RW_BUFFERS]);
-       for (i = 0; i < SI_NUM_VERTEX_BUFFERS; i++)
-               pipe_vertex_buffer_unreference(&sctx->vertex_buffer[i]);
+   for (i = 0; i < SI_NUM_SHADERS; i++) {
+      si_release_buffer_resources(&sctx->const_and_shader_buffers[i],
+                                  si_const_and_shader_buffer_descriptors(sctx, i));
+      si_release_sampler_views(&sctx->samplers[i]);
+      si_release_image_views(&sctx->images[i]);
+   }
+   si_release_buffer_resources(&sctx->rw_buffers, &sctx->descriptors[SI_DESCS_RW_BUFFERS]);
+   for (i = 0; i < SI_NUM_VERTEX_BUFFERS; i++)
+      pipe_vertex_buffer_unreference(&sctx->vertex_buffer[i]);
 
-       for (i = 0; i < SI_NUM_DESCS; ++i)
-               si_release_descriptors(&sctx->descriptors[i]);
+   for (i = 0; i < SI_NUM_DESCS; ++i)
+      si_release_descriptors(&sctx->descriptors[i]);
 
-       si_resource_reference(&sctx->vb_descriptors_buffer, NULL);
-       sctx->vb_descriptors_gpu_list = NULL; /* points into a mapped buffer */
+   si_resource_reference(&sctx->vb_descriptors_buffer, NULL);
+   sctx->vb_descriptors_gpu_list = NULL; /* points into a mapped buffer */
 
-       si_release_bindless_descriptors(sctx);
+   si_release_bindless_descriptors(sctx);
 }
 
 void si_gfx_resources_add_all_to_bo_list(struct si_context *sctx)
 {
-       for (unsigned i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) {
-               si_buffer_resources_begin_new_cs(sctx, &sctx->const_and_shader_buffers[i]);
-               si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i]);
-               si_image_views_begin_new_cs(sctx, &sctx->images[i]);
-       }
-       si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers);
-       si_vertex_buffers_begin_new_cs(sctx);
+   for (unsigned i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) {
+      si_buffer_resources_begin_new_cs(sctx, &sctx->const_and_shader_buffers[i]);
+      si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i]);
+      si_image_views_begin_new_cs(sctx, &sctx->images[i]);
+   }
+   si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers);
+   si_vertex_buffers_begin_new_cs(sctx);
 
-       if (sctx->bo_list_add_all_resident_resources)
-               si_resident_buffers_add_all_to_bo_list(sctx);
+   if (sctx->bo_list_add_all_resident_resources)
+      si_resident_buffers_add_all_to_bo_list(sctx);
 
-       assert(sctx->bo_list_add_all_gfx_resources);
-       sctx->bo_list_add_all_gfx_resources = false;
+   assert(sctx->bo_list_add_all_gfx_resources);
+   sctx->bo_list_add_all_gfx_resources = false;
 }
 
 void si_compute_resources_add_all_to_bo_list(struct si_context *sctx)
 {
-       unsigned sh = PIPE_SHADER_COMPUTE;
+   unsigned sh = PIPE_SHADER_COMPUTE;
 
-       si_buffer_resources_begin_new_cs(sctx, &sctx->const_and_shader_buffers[sh]);
-       si_sampler_views_begin_new_cs(sctx, &sctx->samplers[sh]);
-       si_image_views_begin_new_cs(sctx, &sctx->images[sh]);
-       si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers);
+   si_buffer_resources_begin_new_cs(sctx, &sctx->const_and_shader_buffers[sh]);
+   si_sampler_views_begin_new_cs(sctx, &sctx->samplers[sh]);
+   si_image_views_begin_new_cs(sctx, &sctx->images[sh]);
+   si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers);
 
-       if (sctx->bo_list_add_all_resident_resources)
-               si_resident_buffers_add_all_to_bo_list(sctx);
+   if (sctx->bo_list_add_all_resident_resources)
+      si_resident_buffers_add_all_to_bo_list(sctx);
 
-       assert(sctx->bo_list_add_all_compute_resources);
-       sctx->bo_list_add_all_compute_resources = false;
+   assert(sctx->bo_list_add_all_compute_resources);
+   sctx->bo_list_add_all_compute_resources = false;
 }
 
 void si_all_descriptors_begin_new_cs(struct si_context *sctx)
 {
-       for (unsigned i = 0; i < SI_NUM_DESCS; ++i)
-               si_descriptors_begin_new_cs(sctx, &sctx->descriptors[i]);
-       si_descriptors_begin_new_cs(sctx, &sctx->bindless_descriptors);
+   for (unsigned i = 0; i < SI_NUM_DESCS; ++i)
+      si_descriptors_begin_new_cs(sctx, &sctx->descriptors[i]);
+   si_descriptors_begin_new_cs(sctx, &sctx->bindless_descriptors);
 
-       si_shader_pointers_begin_new_cs(sctx);
+   si_shader_pointers_begin_new_cs(sctx);
 
-       sctx->bo_list_add_all_resident_resources = true;
-       sctx->bo_list_add_all_gfx_resources = true;
-       sctx->bo_list_add_all_compute_resources = true;
+   sctx->bo_list_add_all_resident_resources = true;
+   sctx->bo_list_add_all_gfx_resources = true;
+   sctx->bo_list_add_all_compute_resources = true;
 }
 
-void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
-                              uint64_t new_active_mask)
+void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx, uint64_t new_active_mask)
 {
-       struct si_descriptors *desc = &sctx->descriptors[desc_idx];
+   struct si_descriptors *desc = &sctx->descriptors[desc_idx];
 
-       /* Ignore no-op updates and updates that disable all slots. */
-       if (!new_active_mask ||
-           new_active_mask == u_bit_consecutive64(desc->first_active_slot,
-                                                  desc->num_active_slots))
-               return;
+   /* Ignore no-op updates and updates that disable all slots. */
+   if (!new_active_mask ||
+       new_active_mask == u_bit_consecutive64(desc->first_active_slot, desc->num_active_slots))
+      return;
 
-       int first, count;
-       u_bit_scan_consecutive_range64(&new_active_mask, &first, &count);
-       assert(new_active_mask == 0);
+   int first, count;
+   u_bit_scan_consecutive_range64(&new_active_mask, &first, &count);
+   assert(new_active_mask == 0);
 
-       /* Upload/dump descriptors if slots are being enabled. */
-       if (first < desc->first_active_slot ||
-           first + count > desc->first_active_slot + desc->num_active_slots)
-               sctx->descriptors_dirty |= 1u << desc_idx;
+   /* Upload/dump descriptors if slots are being enabled. */
+   if (first < desc->first_active_slot ||
+       first + count > desc->first_active_slot + desc->num_active_slots)
+      sctx->descriptors_dirty |= 1u << desc_idx;
 
-       desc->first_active_slot = first;
-       desc->num_active_slots = count;
+   desc->first_active_slot = first;
+   desc->num_active_slots = count;
 }
 
-void si_set_active_descriptors_for_shader(struct si_context *sctx,
-                                         struct si_shader_selector *sel)
+void si_set_active_descriptors_for_shader(struct si_context *sctx, struct si_shader_selector *sel)
 {
-       if (!sel)
-               return;
+   if (!sel)
+      return;
 
-       si_set_active_descriptors(sctx,
-               si_const_and_shader_buffer_descriptors_idx(sel->type),
-               sel->active_const_and_shader_buffers);
-       si_set_active_descriptors(sctx,
-               si_sampler_and_image_descriptors_idx(sel->type),
-               sel->active_samplers_and_images);
+   si_set_active_descriptors(sctx, si_const_and_shader_buffer_descriptors_idx(sel->type),
+                             sel->active_const_and_shader_buffers);
+   si_set_active_descriptors(sctx, si_sampler_and_image_descriptors_idx(sel->type),
+                             sel->active_samplers_and_images);
 }
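
For reference, si_set_active_descriptors() above collapses the per-shader slot bitmask into a single consecutive (first, count) range and only marks the descriptor set dirty when the new range extends outside the previously active one. The stand-alone sketch below shows the same collapse; the helper name and the GCC/Clang builtins are illustrative only, the driver itself does this with u_bit_scan_consecutive_range64().

#include <stdbool.h>
#include <stdint.h>

/* Collapse a slot mask into (first, count). Returns false if the set bits
 * are not one consecutive run, which is the precondition the code above
 * relies on (the range scan must consume the whole mask). */
bool slot_mask_to_range(uint64_t mask, int *first, int *count)
{
   if (!mask)
      return false;

   int lo = __builtin_ctzll(mask);      /* lowest active slot */
   int hi = 63 - __builtin_clzll(mask); /* highest active slot */

   *first = lo;
   *count = hi - lo + 1;

   uint64_t run = (*count == 64) ? ~0ull : ((((uint64_t)1 << *count) - 1) << lo);
   return mask == run;
}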
index c58b2b103bee22f26dcff4cd5ce61c92efd23f99..673c3310a1a9582be34478a22acd29fcef20631e 100644
 
 static void si_dma_emit_wait_idle(struct si_context *sctx)
 {
-       struct radeon_cmdbuf *cs = sctx->sdma_cs;
+   struct radeon_cmdbuf *cs = sctx->sdma_cs;
 
-       /* NOP waits for idle. */
-       if (sctx->chip_class >= GFX7)
-               radeon_emit(cs, 0x00000000); /* NOP */
-       else
-               radeon_emit(cs, 0xf0000000); /* NOP */
+   /* NOP waits for idle. */
+   if (sctx->chip_class >= GFX7)
+      radeon_emit(cs, 0x00000000); /* NOP */
+   else
+      radeon_emit(cs, 0xf0000000); /* NOP */
 }
 
-void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst,
-                          uint64_t offset)
+void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, uint64_t offset)
 {
-       struct radeon_cmdbuf *cs = sctx->sdma_cs;
-       uint64_t va = dst->gpu_address + offset;
+   struct radeon_cmdbuf *cs = sctx->sdma_cs;
+   uint64_t va = dst->gpu_address + offset;
 
-       if (sctx->chip_class == GFX6) {
-               unreachable("SI DMA doesn't support the timestamp packet.");
-               return;
-       }
+   if (sctx->chip_class == GFX6) {
+      unreachable("SI DMA doesn't support the timestamp packet.");
+      return;
+   }
 
-       /* Mark the buffer range of destination as valid (initialized),
-        * so that transfer_map knows it should wait for the GPU when mapping
-        * that range. */
-       util_range_add(&dst->b.b, &dst->valid_buffer_range, offset, offset + 8);
+   /* Mark the buffer range of destination as valid (initialized),
+    * so that transfer_map knows it should wait for the GPU when mapping
+    * that range. */
+   util_range_add(&dst->b.b, &dst->valid_buffer_range, offset, offset + 8);
 
-       assert(va % 8 == 0);
+   assert(va % 8 == 0);
 
-       si_need_dma_space(sctx, 4, dst, NULL);
-       si_dma_emit_wait_idle(sctx);
+   si_need_dma_space(sctx, 4, dst, NULL);
+   si_dma_emit_wait_idle(sctx);
 
-       radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP,
-                                       SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP,
-                                       0));
-       radeon_emit(cs, va);
-       radeon_emit(cs, va >> 32);
+   radeon_emit(
+      cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP, SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP, 0));
+   radeon_emit(cs, va);
+   radeon_emit(cs, va >> 32);
 }
 
-void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
-                         uint64_t offset, uint64_t size, unsigned clear_value)
+void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset,
+                          uint64_t size, unsigned clear_value)
 {
-       struct radeon_cmdbuf *cs = sctx->sdma_cs;
-       unsigned i, ncopy, csize;
-       struct si_resource *sdst = si_resource(dst);
-
-       assert(offset % 4 == 0);
-       assert(size);
-       assert(size % 4 == 0);
-
-       if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE ||
-           sctx->screen->debug_flags & DBG(NO_SDMA_CLEARS)) {
-               sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4);
-               return;
-       }
-
-       /* Mark the buffer range of destination as valid (initialized),
-        * so that transfer_map knows it should wait for the GPU when mapping
-        * that range. */
-       util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size);
-
-       offset += sdst->gpu_address;
-
-       if (sctx->chip_class == GFX6) {
-               /* the same maximum size as for copying */
-               ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
-               si_need_dma_space(sctx, ncopy * 4, sdst, NULL);
-
-               for (i = 0; i < ncopy; i++) {
-                       csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
-                       radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0,
-                                                     csize / 4));
-                       radeon_emit(cs, offset);
-                       radeon_emit(cs, clear_value);
-                       radeon_emit(cs, (offset >> 32) << 16);
-                       offset += csize;
-                       size -= csize;
-               }
-               return;
-       }
-
-       /* The following code is for Sea Islands and later. */
-       /* the same maximum size as for copying */
-       ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE);
-       si_need_dma_space(sctx, ncopy * 5, sdst, NULL);
-
-       for (i = 0; i < ncopy; i++) {
-               csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE);
-               radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0,
-                                               0x8000 /* dword copy */));
-               radeon_emit(cs, offset);
-               radeon_emit(cs, offset >> 32);
-               radeon_emit(cs, clear_value);
-               /* dw count */
-               radeon_emit(cs, (sctx->chip_class >= GFX9 ? csize - 1 : csize) & 0xfffffffc);
-               offset += csize;
-               size -= csize;
-       }
+   struct radeon_cmdbuf *cs = sctx->sdma_cs;
+   unsigned i, ncopy, csize;
+   struct si_resource *sdst = si_resource(dst);
+
+   assert(offset % 4 == 0);
+   assert(size);
+   assert(size % 4 == 0);
+
+   if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE ||
+       sctx->screen->debug_flags & DBG(NO_SDMA_CLEARS)) {
+      sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4);
+      return;
+   }
+
+   /* Mark the buffer range of destination as valid (initialized),
+    * so that transfer_map knows it should wait for the GPU when mapping
+    * that range. */
+   util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size);
+
+   offset += sdst->gpu_address;
+
+   if (sctx->chip_class == GFX6) {
+      /* the same maximum size as for copying */
+      ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
+      si_need_dma_space(sctx, ncopy * 4, sdst, NULL);
+
+      for (i = 0; i < ncopy; i++) {
+         csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
+         radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0, csize / 4));
+         radeon_emit(cs, offset);
+         radeon_emit(cs, clear_value);
+         radeon_emit(cs, (offset >> 32) << 16);
+         offset += csize;
+         size -= csize;
+      }
+      return;
+   }
+
+   /* The following code is for Sea Islands and later. */
+   /* the same maximum size as for copying */
+   ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE);
+   si_need_dma_space(sctx, ncopy * 5, sdst, NULL);
+
+   for (i = 0; i < ncopy; i++) {
+      csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE);
+      radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0, 0x8000 /* dword copy */));
+      radeon_emit(cs, offset);
+      radeon_emit(cs, offset >> 32);
+      radeon_emit(cs, clear_value);
+      /* dw count */
+      radeon_emit(cs, (sctx->chip_class >= GFX9 ? csize - 1 : csize) & 0xfffffffc);
+      offset += csize;
+      size -= csize;
+   }
 }
 
 void si_sdma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
-                        struct pipe_resource *src, uint64_t dst_offset,
-                        uint64_t src_offset, uint64_t size)
+                         struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset,
+                         uint64_t size)
 {
-       struct radeon_cmdbuf *cs = sctx->sdma_cs;
-       unsigned i, ncopy, csize;
-       struct si_resource *sdst = si_resource(dst);
-       struct si_resource *ssrc = si_resource(src);
-
-       if (!cs ||
-           dst->flags & PIPE_RESOURCE_FLAG_SPARSE ||
-           src->flags & PIPE_RESOURCE_FLAG_SPARSE) {
-               si_copy_buffer(sctx, dst, src, dst_offset, src_offset, size);
-               return;
-       }
-
-       /* Mark the buffer range of destination as valid (initialized),
-        * so that transfer_map knows it should wait for the GPU when mapping
-        * that range. */
-       util_range_add(dst, &sdst->valid_buffer_range, dst_offset,
-                      dst_offset + size);
-
-       dst_offset += sdst->gpu_address;
-       src_offset += ssrc->gpu_address;
-
-       if (sctx->chip_class == GFX6) {
-               unsigned max_size, sub_cmd, shift;
-
-               /* see whether we should use the dword-aligned or byte-aligned copy */
-               if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) {
-                       sub_cmd = SI_DMA_COPY_DWORD_ALIGNED;
-                       shift = 2;
-                       max_size = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE;
-               } else {
-                       sub_cmd = SI_DMA_COPY_BYTE_ALIGNED;
-                       shift = 0;
-                       max_size = SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE;
-               }
-
-               ncopy = DIV_ROUND_UP(size, max_size);
-               si_need_dma_space(sctx, ncopy * 5, sdst, ssrc);
-
-               for (i = 0; i < ncopy; i++) {
-                       csize = MIN2(size, max_size);
-                       radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd,
-                                                     csize >> shift));
-                       radeon_emit(cs, dst_offset);
-                       radeon_emit(cs, src_offset);
-                       radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
-                       radeon_emit(cs, (src_offset >> 32UL) & 0xff);
-                       dst_offset += csize;
-                       src_offset += csize;
-                       size -= csize;
-               }
-               return;
-       }
-
-       /* The following code is for CI and later. */
-       unsigned align = ~0u;
-       ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE);
-
-       /* Align copy size to dw if src/dst address are dw aligned */
-       if ((src_offset & 0x3) == 0 &&
-           (dst_offset & 0x3) == 0 &&
-           size > 4 &&
-           (size & 3) != 0) {
-               align = ~0x3u;
-               ncopy++;
-       }
-
-       si_need_dma_space(sctx, ncopy * 7, sdst, ssrc);
-
-       for (i = 0; i < ncopy; i++) {
-               csize = size >= 4 ? MIN2(size & align, CIK_SDMA_COPY_MAX_SIZE) : size;
-               radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
-                                               CIK_SDMA_COPY_SUB_OPCODE_LINEAR,
-                                               0));
-               radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize);
-               radeon_emit(cs, 0); /* src/dst endian swap */
-               radeon_emit(cs, src_offset);
-               radeon_emit(cs, src_offset >> 32);
-               radeon_emit(cs, dst_offset);
-               radeon_emit(cs, dst_offset >> 32);
-               dst_offset += csize;
-               src_offset += csize;
-               size -= csize;
-       }
+   struct radeon_cmdbuf *cs = sctx->sdma_cs;
+   unsigned i, ncopy, csize;
+   struct si_resource *sdst = si_resource(dst);
+   struct si_resource *ssrc = si_resource(src);
+
+   if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE || src->flags & PIPE_RESOURCE_FLAG_SPARSE) {
+      si_copy_buffer(sctx, dst, src, dst_offset, src_offset, size);
+      return;
+   }
+
+   /* Mark the buffer range of destination as valid (initialized),
+    * so that transfer_map knows it should wait for the GPU when mapping
+    * that range. */
+   util_range_add(dst, &sdst->valid_buffer_range, dst_offset, dst_offset + size);
+
+   dst_offset += sdst->gpu_address;
+   src_offset += ssrc->gpu_address;
+
+   if (sctx->chip_class == GFX6) {
+      unsigned max_size, sub_cmd, shift;
+
+      /* see whether we should use the dword-aligned or byte-aligned copy */
+      if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) {
+         sub_cmd = SI_DMA_COPY_DWORD_ALIGNED;
+         shift = 2;
+         max_size = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE;
+      } else {
+         sub_cmd = SI_DMA_COPY_BYTE_ALIGNED;
+         shift = 0;
+         max_size = SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE;
+      }
+
+      ncopy = DIV_ROUND_UP(size, max_size);
+      si_need_dma_space(sctx, ncopy * 5, sdst, ssrc);
+
+      for (i = 0; i < ncopy; i++) {
+         csize = MIN2(size, max_size);
+         radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, csize >> shift));
+         radeon_emit(cs, dst_offset);
+         radeon_emit(cs, src_offset);
+         radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
+         radeon_emit(cs, (src_offset >> 32UL) & 0xff);
+         dst_offset += csize;
+         src_offset += csize;
+         size -= csize;
+      }
+      return;
+   }
+
+   /* The following code is for CI and later. */
+   unsigned align = ~0u;
+   ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE);
+
+   /* Align copy size to dw if src/dst address are dw aligned */
+   if ((src_offset & 0x3) == 0 && (dst_offset & 0x3) == 0 && size > 4 && (size & 3) != 0) {
+      align = ~0x3u;
+      ncopy++;
+   }
+
+   si_need_dma_space(sctx, ncopy * 7, sdst, ssrc);
+
+   for (i = 0; i < ncopy; i++) {
+      csize = size >= 4 ? MIN2(size & align, CIK_SDMA_COPY_MAX_SIZE) : size;
+      radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR, 0));
+      radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize);
+      radeon_emit(cs, 0); /* src/dst endian swap */
+      radeon_emit(cs, src_offset);
+      radeon_emit(cs, src_offset >> 32);
+      radeon_emit(cs, dst_offset);
+      radeon_emit(cs, dst_offset >> 32);
+      dst_offset += csize;
+      src_offset += csize;
+      size -= csize;
+   }
 }
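
The GFX7+ path of si_sdma_copy_buffer() above splits a copy into packets of at most CIK_SDMA_COPY_MAX_SIZE bytes and, when both addresses are dword aligned but the size is not a dword multiple, rounds every chunk down to a dword boundary and emits one extra packet for the byte tail. A stand-alone sketch of that arithmetic, assuming a hypothetical MAX_COPY limit and print statements in place of packet emission:

#include <stdint.h>
#include <stdio.h>

#define MAX_COPY (1u << 21)                        /* hypothetical per-packet limit */
#define DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
#define MIN2(a, b) ((a) < (b) ? (a) : (b))

void split_copy(uint64_t dst, uint64_t src, uint64_t size)
{
   uint64_t align = ~0ull;
   unsigned ncopy = DIV_ROUND_UP(size, MAX_COPY);

   if ((src & 3) == 0 && (dst & 3) == 0 && size > 4 && (size & 3) != 0) {
      align = ~3ull; /* round each chunk down to a dword multiple... */
      ncopy++;       /* ...and emit one extra packet for the remaining bytes */
   }

   for (unsigned i = 0; i < ncopy; i++) {
      uint64_t csize = size >= 4 ? MIN2(size & align, MAX_COPY) : size;
      printf("packet %u: %llu bytes, dst=%#llx, src=%#llx\n", i,
             (unsigned long long)csize, (unsigned long long)dst,
             (unsigned long long)src);
      dst += csize;
      src += csize;
      size -= csize;
   }
}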
 
-void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
-                      struct si_resource *dst, struct si_resource *src)
+void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct si_resource *dst,
+                       struct si_resource *src)
 {
-       struct radeon_winsys *ws = ctx->ws;
-       uint64_t vram = ctx->sdma_cs->used_vram;
-       uint64_t gtt = ctx->sdma_cs->used_gart;
-
-       if (dst) {
-               vram += dst->vram_usage;
-               gtt += dst->gart_usage;
-       }
-       if (src) {
-               vram += src->vram_usage;
-               gtt += src->gart_usage;
-       }
-
-       /* Flush the GFX IB if DMA depends on it. */
-       if (!ctx->sdma_uploads_in_progress &&
-           radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
-           ((dst &&
-             ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf,
-                                         RADEON_USAGE_READWRITE)) ||
-            (src &&
-             ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf,
-                                         RADEON_USAGE_WRITE))))
-               si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
-
-       /* Flush if there's not enough space, or if the memory usage per IB
-        * is too large.
-        *
-        * IBs using too little memory are limited by the IB submission overhead.
-        * IBs using too much memory are limited by the kernel/TTM overhead.
-        * Too long IBs create CPU-GPU pipeline bubbles and add latency.
-        *
-        * This heuristic makes sure that DMA requests are executed
-        * very soon after the call is made and lowers memory usage.
-        * It improves texture upload performance by keeping the DMA
-        * engine busy while uploads are being submitted.
-        */
-       num_dw++; /* for emit_wait_idle below */
-       if (!ctx->sdma_uploads_in_progress &&
-           (!ws->cs_check_space(ctx->sdma_cs, num_dw, false) ||
-            ctx->sdma_cs->used_vram + ctx->sdma_cs->used_gart > 64 * 1024 * 1024 ||
-            !radeon_cs_memory_below_limit(ctx->screen, ctx->sdma_cs, vram, gtt))) {
-               si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
-               assert((num_dw + ctx->sdma_cs->current.cdw) <= ctx->sdma_cs->current.max_dw);
-       }
-
-       /* Wait for idle if either buffer has been used in the IB before to
-        * prevent read-after-write hazards.
-        */
-       if ((dst &&
-            ws->cs_is_buffer_referenced(ctx->sdma_cs, dst->buf,
-                                        RADEON_USAGE_READWRITE)) ||
-           (src &&
-            ws->cs_is_buffer_referenced(ctx->sdma_cs, src->buf,
-                                        RADEON_USAGE_WRITE)))
-               si_dma_emit_wait_idle(ctx);
-
-       unsigned sync = ctx->sdma_uploads_in_progress ? 0 : RADEON_USAGE_SYNCHRONIZED;
-       if (dst) {
-               ws->cs_add_buffer(ctx->sdma_cs, dst->buf, RADEON_USAGE_WRITE | sync,
-                                 dst->domains, 0);
-       }
-       if (src) {
-               ws->cs_add_buffer(ctx->sdma_cs, src->buf, RADEON_USAGE_READ | sync,
-                                 src->domains, 0);
-       }
-
-       /* this function is called before all DMA calls, so increment this. */
-       ctx->num_dma_calls++;
+   struct radeon_winsys *ws = ctx->ws;
+   uint64_t vram = ctx->sdma_cs->used_vram;
+   uint64_t gtt = ctx->sdma_cs->used_gart;
+
+   if (dst) {
+      vram += dst->vram_usage;
+      gtt += dst->gart_usage;
+   }
+   if (src) {
+      vram += src->vram_usage;
+      gtt += src->gart_usage;
+   }
+
+   /* Flush the GFX IB if DMA depends on it. */
+   if (!ctx->sdma_uploads_in_progress && radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
+       ((dst && ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf, RADEON_USAGE_READWRITE)) ||
+        (src && ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf, RADEON_USAGE_WRITE))))
+      si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+
+   /* Flush if there's not enough space, or if the memory usage per IB
+    * is too large.
+    *
+    * IBs using too little memory are limited by the IB submission overhead.
+    * IBs using too much memory are limited by the kernel/TTM overhead.
+    * Too long IBs create CPU-GPU pipeline bubbles and add latency.
+    *
+    * This heuristic makes sure that DMA requests are executed
+    * very soon after the call is made and lowers memory usage.
+    * It improves texture upload performance by keeping the DMA
+    * engine busy while uploads are being submitted.
+    */
+   num_dw++; /* for emit_wait_idle below */
+   if (!ctx->sdma_uploads_in_progress &&
+       (!ws->cs_check_space(ctx->sdma_cs, num_dw, false) ||
+        ctx->sdma_cs->used_vram + ctx->sdma_cs->used_gart > 64 * 1024 * 1024 ||
+        !radeon_cs_memory_below_limit(ctx->screen, ctx->sdma_cs, vram, gtt))) {
+      si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+      assert((num_dw + ctx->sdma_cs->current.cdw) <= ctx->sdma_cs->current.max_dw);
+   }
+
+   /* Wait for idle if either buffer has been used in the IB before to
+    * prevent read-after-write hazards.
+    */
+   if ((dst && ws->cs_is_buffer_referenced(ctx->sdma_cs, dst->buf, RADEON_USAGE_READWRITE)) ||
+       (src && ws->cs_is_buffer_referenced(ctx->sdma_cs, src->buf, RADEON_USAGE_WRITE)))
+      si_dma_emit_wait_idle(ctx);
+
+   unsigned sync = ctx->sdma_uploads_in_progress ? 0 : RADEON_USAGE_SYNCHRONIZED;
+   if (dst) {
+      ws->cs_add_buffer(ctx->sdma_cs, dst->buf, RADEON_USAGE_WRITE | sync, dst->domains, 0);
+   }
+   if (src) {
+      ws->cs_add_buffer(ctx->sdma_cs, src->buf, RADEON_USAGE_READ | sync, src->domains, 0);
+   }
+
+   /* this function is called before all DMA calls, so increment this. */
+   ctx->num_dma_calls++;
 }
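
The flush decision in si_need_dma_space() above boils down to a pure predicate over the current SDMA IB: never flush while staged uploads are in flight, otherwise flush when command space runs out, when the IB alone references more than 64 MB, or when overall memory usage is over the winsys limit. A minimal sketch, with hypothetical struct and helper names standing in for the winsys queries used above (cs_check_space, used_vram/used_gart, radeon_cs_memory_below_limit):

#include <stdbool.h>
#include <stdint.h>

struct sdma_ib_state {
   bool has_space;          /* result of the cs_check_space() query */
   uint64_t used_vram;      /* bytes of VRAM referenced by this IB */
   uint64_t used_gart;      /* bytes of GART referenced by this IB */
   bool memory_below_limit; /* result of radeon_cs_memory_below_limit() */
};

bool sdma_ib_needs_flush(const struct sdma_ib_state *ib, bool uploads_in_progress)
{
   if (uploads_in_progress)
      return false; /* never flush in the middle of staged SDMA uploads */

   /* Flush when out of command space, when this IB alone references more
    * than 64 MB, or when overall memory usage exceeds the winsys limit. */
   return !ib->has_space ||
          ib->used_vram + ib->used_gart > 64ull * 1024 * 1024 ||
          !ib->memory_below_limit;
}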
 
-void si_flush_dma_cs(struct si_context *ctx, unsigned flags,
-                    struct pipe_fence_handle **fence)
+void si_flush_dma_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence)
 {
-       struct radeon_cmdbuf *cs = ctx->sdma_cs;
-       struct radeon_saved_cs saved;
-       bool check_vm = (ctx->screen->debug_flags & DBG(CHECK_VM)) != 0;
-
-       if (!radeon_emitted(cs, 0)) {
-               if (fence)
-                       ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
-               return;
-       }
-
-       if (check_vm)
-               si_save_cs(ctx->ws, cs, &saved, true);
-
-       ctx->ws->cs_flush(cs, flags, &ctx->last_sdma_fence);
-       if (fence)
-               ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
-
-       if (check_vm) {
-               /* Use conservative timeout 800ms, after which we won't wait any
-                * longer and assume the GPU is hung.
-                */
-               ctx->ws->fence_wait(ctx->ws, ctx->last_sdma_fence, 800*1000*1000);
-
-               si_check_vm_faults(ctx, &saved, RING_DMA);
-               si_clear_saved_cs(&saved);
-       }
+   struct radeon_cmdbuf *cs = ctx->sdma_cs;
+   struct radeon_saved_cs saved;
+   bool check_vm = (ctx->screen->debug_flags & DBG(CHECK_VM)) != 0;
+
+   if (!radeon_emitted(cs, 0)) {
+      if (fence)
+         ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
+      return;
+   }
+
+   if (check_vm)
+      si_save_cs(ctx->ws, cs, &saved, true);
+
+   ctx->ws->cs_flush(cs, flags, &ctx->last_sdma_fence);
+   if (fence)
+      ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
+
+   if (check_vm) {
+      /* Use conservative timeout 800ms, after which we won't wait any
+       * longer and assume the GPU is hung.
+       */
+      ctx->ws->fence_wait(ctx->ws, ctx->last_sdma_fence, 800 * 1000 * 1000);
+
+      si_check_vm_faults(ctx, &saved, RING_DMA);
+      si_clear_saved_cs(&saved);
+   }
 }
 
-void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst,
-                           uint64_t offset, uint64_t size, unsigned value)
+void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset,
+                            uint64_t size, unsigned value)
 {
-       struct si_context *ctx = (struct si_context*)sscreen->aux_context;
+   struct si_context *ctx = (struct si_context *)sscreen->aux_context;
 
-       simple_mtx_lock(&sscreen->aux_context_lock);
-       si_sdma_clear_buffer(ctx, dst, offset, size, value);
-       sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
-       simple_mtx_unlock(&sscreen->aux_context_lock);
+   simple_mtx_lock(&sscreen->aux_context_lock);
+   si_sdma_clear_buffer(ctx, dst, offset, size, value);
+   sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
+   simple_mtx_unlock(&sscreen->aux_context_lock);
 }
index 26b5fc4bdba80654b4d5e2bd8c8f39ab6a881b7d..91d1bed505d285bac39704871d59508788f58a8c 100644
  *
  */
 
-#include <libsync.h>
-
+#include "si_build_pm4.h"
 #include "util/os_time.h"
 #include "util/u_memory.h"
 #include "util/u_queue.h"
 #include "util/u_upload_mgr.h"
 
-#include "si_build_pm4.h"
+#include <libsync.h>
 
 struct si_fine_fence {
-       struct si_resource *buf;
-       unsigned offset;
+   struct si_resource *buf;
+   unsigned offset;
 };
 
 struct si_multi_fence {
-       struct pipe_reference reference;
-       struct pipe_fence_handle *gfx;
-       struct pipe_fence_handle *sdma;
-       struct tc_unflushed_batch_token *tc_token;
-       struct util_queue_fence ready;
-
-       /* If the context wasn't flushed at fence creation, this is non-NULL. */
-       struct {
-               struct si_context *ctx;
-               unsigned ib_index;
-       } gfx_unflushed;
-
-       struct si_fine_fence fine;
+   struct pipe_reference reference;
+   struct pipe_fence_handle *gfx;
+   struct pipe_fence_handle *sdma;
+   struct tc_unflushed_batch_token *tc_token;
+   struct util_queue_fence ready;
+
+   /* If the context wasn't flushed at fence creation, this is non-NULL. */
+   struct {
+      struct si_context *ctx;
+      unsigned ib_index;
+   } gfx_unflushed;
+
+   struct si_fine_fence fine;
 };
 
 /**
@@ -66,591 +65,554 @@ struct si_multi_fence {
  * \param old_value    Previous fence value (for a bug workaround)
  * \param new_value    Fence value to write for this event.
  */
-void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs,
-                      unsigned event, unsigned event_flags,
-                      unsigned dst_sel, unsigned int_sel, unsigned data_sel,
-                      struct si_resource *buf, uint64_t va,
-                      uint32_t new_fence, unsigned query_type)
+void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigned event,
+                       unsigned event_flags, unsigned dst_sel, unsigned int_sel, unsigned data_sel,
+                       struct si_resource *buf, uint64_t va, uint32_t new_fence,
+                       unsigned query_type)
 {
-       unsigned op = EVENT_TYPE(event) |
-                     EVENT_INDEX(event == V_028A90_CS_DONE ||
-                                 event == V_028A90_PS_DONE ? 6 : 5) |
-                     event_flags;
-       unsigned sel = EOP_DST_SEL(dst_sel) |
-                      EOP_INT_SEL(int_sel) |
-                      EOP_DATA_SEL(data_sel);
-       bool compute_ib = !ctx->has_graphics ||
-                         cs == ctx->prim_discard_compute_cs;
-
-       if (ctx->chip_class >= GFX9 ||
-           (compute_ib && ctx->chip_class >= GFX7)) {
-               /* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion
-                * counters) must immediately precede every timestamp event to
-                * prevent a GPU hang on GFX9.
-                *
-                * Occlusion queries don't need to do it here, because they
-                * always do ZPASS_DONE before the timestamp.
-                */
-               if (ctx->chip_class == GFX9 && !compute_ib &&
-                   query_type != PIPE_QUERY_OCCLUSION_COUNTER &&
-                   query_type != PIPE_QUERY_OCCLUSION_PREDICATE &&
-                   query_type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
-                       struct si_resource *scratch = ctx->eop_bug_scratch;
-
-                       assert(16 * ctx->screen->info.num_render_backends <=
-                              scratch->b.b.width0);
-                       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
-                       radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
-                       radeon_emit(cs, scratch->gpu_address);
-                       radeon_emit(cs, scratch->gpu_address >> 32);
-
-                       radeon_add_to_buffer_list(ctx, ctx->gfx_cs, scratch,
-                                                 RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
-               }
-
-               radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, ctx->chip_class >= GFX9 ? 6 : 5, 0));
-               radeon_emit(cs, op);
-               radeon_emit(cs, sel);
-               radeon_emit(cs, va);            /* address lo */
-               radeon_emit(cs, va >> 32);      /* address hi */
-               radeon_emit(cs, new_fence);     /* immediate data lo */
-               radeon_emit(cs, 0); /* immediate data hi */
-               if (ctx->chip_class >= GFX9)
-                       radeon_emit(cs, 0); /* unused */
-       } else {
-               if (ctx->chip_class == GFX7 ||
-                   ctx->chip_class == GFX8) {
-                       struct si_resource *scratch = ctx->eop_bug_scratch;
-                       uint64_t va = scratch->gpu_address;
-
-                       /* Two EOP events are required to make all engines go idle
-                        * (and optional cache flushes executed) before the timestamp
-                        * is written.
-                        */
-                       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
-                       radeon_emit(cs, op);
-                       radeon_emit(cs, va);
-                       radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
-                       radeon_emit(cs, 0); /* immediate data */
-                       radeon_emit(cs, 0); /* unused */
-
-                       radeon_add_to_buffer_list(ctx, ctx->gfx_cs, scratch,
-                                                 RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
-               }
-
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
-               radeon_emit(cs, op);
-               radeon_emit(cs, va);
-               radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
-               radeon_emit(cs, new_fence); /* immediate data */
-               radeon_emit(cs, 0); /* unused */
-       }
-
-       if (buf) {
-               radeon_add_to_buffer_list(ctx, ctx->gfx_cs, buf, RADEON_USAGE_WRITE,
-                                         RADEON_PRIO_QUERY);
-       }
+   unsigned op = EVENT_TYPE(event) |
+                 EVENT_INDEX(event == V_028A90_CS_DONE || event == V_028A90_PS_DONE ? 6 : 5) |
+                 event_flags;
+   unsigned sel = EOP_DST_SEL(dst_sel) | EOP_INT_SEL(int_sel) | EOP_DATA_SEL(data_sel);
+   bool compute_ib = !ctx->has_graphics || cs == ctx->prim_discard_compute_cs;
+
+   if (ctx->chip_class >= GFX9 || (compute_ib && ctx->chip_class >= GFX7)) {
+      /* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion
+       * counters) must immediately precede every timestamp event to
+       * prevent a GPU hang on GFX9.
+       *
+       * Occlusion queries don't need to do it here, because they
+       * always do ZPASS_DONE before the timestamp.
+       */
+      if (ctx->chip_class == GFX9 && !compute_ib && query_type != PIPE_QUERY_OCCLUSION_COUNTER &&
+          query_type != PIPE_QUERY_OCCLUSION_PREDICATE &&
+          query_type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
+         struct si_resource *scratch = ctx->eop_bug_scratch;
+
+         assert(16 * ctx->screen->info.num_render_backends <= scratch->b.b.width0);
+         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+         radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
+         radeon_emit(cs, scratch->gpu_address);
+         radeon_emit(cs, scratch->gpu_address >> 32);
+
+         radeon_add_to_buffer_list(ctx, ctx->gfx_cs, scratch, RADEON_USAGE_WRITE,
+                                   RADEON_PRIO_QUERY);
+      }
+
+      radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, ctx->chip_class >= GFX9 ? 6 : 5, 0));
+      radeon_emit(cs, op);
+      radeon_emit(cs, sel);
+      radeon_emit(cs, va);        /* address lo */
+      radeon_emit(cs, va >> 32);  /* address hi */
+      radeon_emit(cs, new_fence); /* immediate data lo */
+      radeon_emit(cs, 0);         /* immediate data hi */
+      if (ctx->chip_class >= GFX9)
+         radeon_emit(cs, 0); /* unused */
+   } else {
+      if (ctx->chip_class == GFX7 || ctx->chip_class == GFX8) {
+         struct si_resource *scratch = ctx->eop_bug_scratch;
+         uint64_t va = scratch->gpu_address;
+
+         /* Two EOP events are required to make all engines go idle
+          * (and optional cache flushes executed) before the timestamp
+          * is written.
+          */
+         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
+         radeon_emit(cs, op);
+         radeon_emit(cs, va);
+         radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
+         radeon_emit(cs, 0); /* immediate data */
+         radeon_emit(cs, 0); /* unused */
+
+         radeon_add_to_buffer_list(ctx, ctx->gfx_cs, scratch, RADEON_USAGE_WRITE,
+                                   RADEON_PRIO_QUERY);
+      }
+
+      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
+      radeon_emit(cs, op);
+      radeon_emit(cs, va);
+      radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
+      radeon_emit(cs, new_fence); /* immediate data */
+      radeon_emit(cs, 0);         /* unused */
+   }
+
+   if (buf) {
+      radeon_add_to_buffer_list(ctx, ctx->gfx_cs, buf, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
+   }
 }
 
 unsigned si_cp_write_fence_dwords(struct si_screen *screen)
 {
-       unsigned dwords = 6;
+   unsigned dwords = 6;
 
-       if (screen->info.chip_class == GFX7 ||
-           screen->info.chip_class == GFX8)
-               dwords *= 2;
+   if (screen->info.chip_class == GFX7 || screen->info.chip_class == GFX8)
+      dwords *= 2;
 
-       return dwords;
+   return dwords;
 }
 
-void si_cp_wait_mem(struct si_context *ctx, struct radeon_cmdbuf *cs,
-                   uint64_t va, uint32_t ref, uint32_t mask, unsigned flags)
+void si_cp_wait_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, uint64_t va, uint32_t ref,
+                    uint32_t mask, unsigned flags)
 {
-       radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
-       radeon_emit(cs, WAIT_REG_MEM_MEM_SPACE(1) | flags);
-       radeon_emit(cs, va);
-       radeon_emit(cs, va >> 32);
-       radeon_emit(cs, ref); /* reference value */
-       radeon_emit(cs, mask); /* mask */
-       radeon_emit(cs, 4); /* poll interval */
+   radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
+   radeon_emit(cs, WAIT_REG_MEM_MEM_SPACE(1) | flags);
+   radeon_emit(cs, va);
+   radeon_emit(cs, va >> 32);
+   radeon_emit(cs, ref);  /* reference value */
+   radeon_emit(cs, mask); /* mask */
+   radeon_emit(cs, 4);    /* poll interval */
 }
 
-static void si_add_fence_dependency(struct si_context *sctx,
-                                   struct pipe_fence_handle *fence)
+static void si_add_fence_dependency(struct si_context *sctx, struct pipe_fence_handle *fence)
 {
-       struct radeon_winsys *ws = sctx->ws;
+   struct radeon_winsys *ws = sctx->ws;
 
-       if (sctx->sdma_cs)
-               ws->cs_add_fence_dependency(sctx->sdma_cs, fence, 0);
-       ws->cs_add_fence_dependency(sctx->gfx_cs, fence, 0);
+   if (sctx->sdma_cs)
+      ws->cs_add_fence_dependency(sctx->sdma_cs, fence, 0);
+   ws->cs_add_fence_dependency(sctx->gfx_cs, fence, 0);
 }
 
-static void si_add_syncobj_signal(struct si_context *sctx,
-                                 struct pipe_fence_handle *fence)
+static void si_add_syncobj_signal(struct si_context *sctx, struct pipe_fence_handle *fence)
 {
-       sctx->ws->cs_add_syncobj_signal(sctx->gfx_cs, fence);
+   sctx->ws->cs_add_syncobj_signal(sctx->gfx_cs, fence);
 }
 
-static void si_fence_reference(struct pipe_screen *screen,
-                              struct pipe_fence_handle **dst,
-                              struct pipe_fence_handle *src)
+static void si_fence_reference(struct pipe_screen *screen, struct pipe_fence_handle **dst,
+                               struct pipe_fence_handle *src)
 {
-       struct radeon_winsys *ws = ((struct si_screen*)screen)->ws;
-       struct si_multi_fence **sdst = (struct si_multi_fence **)dst;
-       struct si_multi_fence *ssrc = (struct si_multi_fence *)src;
-
-       if (pipe_reference(&(*sdst)->reference, &ssrc->reference)) {
-               ws->fence_reference(&(*sdst)->gfx, NULL);
-               ws->fence_reference(&(*sdst)->sdma, NULL);
-               tc_unflushed_batch_token_reference(&(*sdst)->tc_token, NULL);
-               si_resource_reference(&(*sdst)->fine.buf, NULL);
-               FREE(*sdst);
-       }
-        *sdst = ssrc;
+   struct radeon_winsys *ws = ((struct si_screen *)screen)->ws;
+   struct si_multi_fence **sdst = (struct si_multi_fence **)dst;
+   struct si_multi_fence *ssrc = (struct si_multi_fence *)src;
+
+   if (pipe_reference(&(*sdst)->reference, &ssrc->reference)) {
+      ws->fence_reference(&(*sdst)->gfx, NULL);
+      ws->fence_reference(&(*sdst)->sdma, NULL);
+      tc_unflushed_batch_token_reference(&(*sdst)->tc_token, NULL);
+      si_resource_reference(&(*sdst)->fine.buf, NULL);
+      FREE(*sdst);
+   }
+   *sdst = ssrc;
 }
 
 static struct si_multi_fence *si_create_multi_fence()
 {
-       struct si_multi_fence *fence = CALLOC_STRUCT(si_multi_fence);
-       if (!fence)
-               return NULL;
+   struct si_multi_fence *fence = CALLOC_STRUCT(si_multi_fence);
+   if (!fence)
+      return NULL;
 
-       pipe_reference_init(&fence->reference, 1);
-       util_queue_fence_init(&fence->ready);
+   pipe_reference_init(&fence->reference, 1);
+   util_queue_fence_init(&fence->ready);
 
-       return fence;
+   return fence;
 }
 
 struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx,
-                                         struct tc_unflushed_batch_token *tc_token)
+                                          struct tc_unflushed_batch_token *tc_token)
 {
-       struct si_multi_fence *fence = si_create_multi_fence();
-       if (!fence)
-               return NULL;
+   struct si_multi_fence *fence = si_create_multi_fence();
+   if (!fence)
+      return NULL;
 
-       util_queue_fence_reset(&fence->ready);
-       tc_unflushed_batch_token_reference(&fence->tc_token, tc_token);
+   util_queue_fence_reset(&fence->ready);
+   tc_unflushed_batch_token_reference(&fence->tc_token, tc_token);
 
-       return (struct pipe_fence_handle *)fence;
+   return (struct pipe_fence_handle *)fence;
 }
 
-static bool si_fine_fence_signaled(struct radeon_winsys *rws,
-                                  const struct si_fine_fence *fine)
+static bool si_fine_fence_signaled(struct radeon_winsys *rws, const struct si_fine_fence *fine)
 {
-       char *map = rws->buffer_map(fine->buf->buf, NULL, PIPE_TRANSFER_READ |
-                                                         PIPE_TRANSFER_UNSYNCHRONIZED);
-       if (!map)
-               return false;
+   char *map =
+      rws->buffer_map(fine->buf->buf, NULL, PIPE_TRANSFER_READ | PIPE_TRANSFER_UNSYNCHRONIZED);
+   if (!map)
+      return false;
 
-       uint32_t *fence = (uint32_t*)(map + fine->offset);
-       return *fence != 0;
+   uint32_t *fence = (uint32_t *)(map + fine->offset);
+   return *fence != 0;
 }
 
-static void si_fine_fence_set(struct si_context *ctx,
-                             struct si_fine_fence *fine,
-                             unsigned flags)
+static void si_fine_fence_set(struct si_context *ctx, struct si_fine_fence *fine, unsigned flags)
 {
-       uint32_t *fence_ptr;
-
-       assert(util_bitcount(flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) == 1);
-
-       /* Use cached system memory for the fence. */
-       u_upload_alloc(ctx->cached_gtt_allocator, 0, 4, 4,
-                      &fine->offset, (struct pipe_resource **)&fine->buf, (void **)&fence_ptr);
-       if (!fine->buf)
-               return;
-
-       *fence_ptr = 0;
-
-       if (flags & PIPE_FLUSH_TOP_OF_PIPE) {
-               uint32_t value = 0x80000000;
-
-               si_cp_write_data(ctx, fine->buf, fine->offset, 4,
-                                V_370_MEM, V_370_PFP, &value);
-       } else if (flags & PIPE_FLUSH_BOTTOM_OF_PIPE) {
-               uint64_t fence_va = fine->buf->gpu_address + fine->offset;
-
-               radeon_add_to_buffer_list(ctx, ctx->gfx_cs, fine->buf,
-                                         RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
-               si_cp_release_mem(ctx, ctx->gfx_cs,
-                                 V_028A90_BOTTOM_OF_PIPE_TS, 0,
-                                 EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
-                                 EOP_DATA_SEL_VALUE_32BIT,
-                                 NULL, fence_va, 0x80000000,
-                                 PIPE_QUERY_GPU_FINISHED);
-       } else {
-               assert(false);
-       }
+   uint32_t *fence_ptr;
+
+   assert(util_bitcount(flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) == 1);
+
+   /* Use cached system memory for the fence. */
+   u_upload_alloc(ctx->cached_gtt_allocator, 0, 4, 4, &fine->offset,
+                  (struct pipe_resource **)&fine->buf, (void **)&fence_ptr);
+   if (!fine->buf)
+      return;
+
+   *fence_ptr = 0;
+
+   if (flags & PIPE_FLUSH_TOP_OF_PIPE) {
+      uint32_t value = 0x80000000;
+
+      si_cp_write_data(ctx, fine->buf, fine->offset, 4, V_370_MEM, V_370_PFP, &value);
+   } else if (flags & PIPE_FLUSH_BOTTOM_OF_PIPE) {
+      uint64_t fence_va = fine->buf->gpu_address + fine->offset;
+
+      radeon_add_to_buffer_list(ctx, ctx->gfx_cs, fine->buf, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
+      si_cp_release_mem(ctx, ctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
+                        EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, NULL, fence_va, 0x80000000,
+                        PIPE_QUERY_GPU_FINISHED);
+   } else {
+      assert(false);
+   }
 }
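
si_fine_fence_set reserves a 4-byte slot in cached GTT memory, clears it, and asks the CP to write the marker 0x80000000 either from the PFP (top of pipe) or through an end-of-pipe release (bottom of pipe); si_fine_fence_signaled above then just checks whether the slot has become non-zero. A self-contained sketch of that CPU-visible protocol, with a plain store standing in for the GPU write (all names here are illustrative, not driver API):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define FINE_FENCE_MARKER 0x80000000u

/* CPU side: the fine fence is "signaled" once the GPU has overwritten the zero. */
static bool fine_fence_signaled(const volatile uint32_t *slot)
{
   return *slot != 0;
}

int main(void)
{
   volatile uint32_t slot = 0;  /* the uploaded dword, cleared by the CPU */
   assert(!fine_fence_signaled(&slot));

   slot = FINE_FENCE_MARKER;    /* stands in for the CP WRITE_DATA / RELEASE_MEM */
   assert(fine_fence_signaled(&slot));
   return 0;
}
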
 
-static bool si_fence_finish(struct pipe_screen *screen,
-                           struct pipe_context *ctx,
-                           struct pipe_fence_handle *fence,
-                           uint64_t timeout)
+static bool si_fence_finish(struct pipe_screen *screen, struct pipe_context *ctx,
+                            struct pipe_fence_handle *fence, uint64_t timeout)
 {
-       struct radeon_winsys *rws = ((struct si_screen*)screen)->ws;
-       struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
-       struct si_context *sctx;
-       int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
-
-       ctx = threaded_context_unwrap_sync(ctx);
-       sctx = (struct si_context*)(ctx ? ctx : NULL);
-
-       if (!util_queue_fence_is_signalled(&sfence->ready)) {
-               if (sfence->tc_token) {
-                       /* Ensure that si_flush_from_st will be called for
-                        * this fence, but only if we're in the API thread
-                        * where the context is current.
-                        *
-                        * Note that the batch containing the flush may already
-                        * be in flight in the driver thread, so the fence
-                        * may not be ready yet when this call returns.
-                        */
-                       threaded_context_flush(ctx, sfence->tc_token,
-                                              timeout == 0);
-               }
-
-               if (!timeout)
-                       return false;
-
-               if (timeout == PIPE_TIMEOUT_INFINITE) {
-                       util_queue_fence_wait(&sfence->ready);
-               } else {
-                       if (!util_queue_fence_wait_timeout(&sfence->ready, abs_timeout))
-                               return false;
-               }
-
-               if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
-                       int64_t time = os_time_get_nano();
-                       timeout = abs_timeout > time ? abs_timeout - time : 0;
-               }
-       }
-
-       if (sfence->sdma) {
-               if (!rws->fence_wait(rws, sfence->sdma, timeout))
-                       return false;
-
-               /* Recompute the timeout after waiting. */
-               if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
-                       int64_t time = os_time_get_nano();
-                       timeout = abs_timeout > time ? abs_timeout - time : 0;
-               }
-       }
-
-       if (!sfence->gfx)
-               return true;
-
-       if (sfence->fine.buf &&
-           si_fine_fence_signaled(rws, &sfence->fine)) {
-               rws->fence_reference(&sfence->gfx, NULL);
-               si_resource_reference(&sfence->fine.buf, NULL);
-               return true;
-       }
-
-       /* Flush the gfx IB if it hasn't been flushed yet. */
-       if (sctx && sfence->gfx_unflushed.ctx == sctx &&
-           sfence->gfx_unflushed.ib_index == sctx->num_gfx_cs_flushes) {
-               /* Section 4.1.2 (Signaling) of the OpenGL 4.6 (Core profile)
-                * spec says:
-                *
-                *    "If the sync object being blocked upon will not be
-                *     signaled in finite time (for example, by an associated
-                *     fence command issued previously, but not yet flushed to
-                *     the graphics pipeline), then ClientWaitSync may hang
-                *     forever. To help prevent this behavior, if
-                *     ClientWaitSync is called and all of the following are
-                *     true:
-                *
-                *     * the SYNC_FLUSH_COMMANDS_BIT bit is set in flags,
-                *     * sync is unsignaled when ClientWaitSync is called,
-                *     * and the calls to ClientWaitSync and FenceSync were
-                *       issued from the same context,
-                *
-                *     then the GL will behave as if the equivalent of Flush
-                *     were inserted immediately after the creation of sync."
-                *
-                * This means we need to flush for such fences even when we're
-                * not going to wait.
-                */
-               si_flush_gfx_cs(sctx,
-                               (timeout ? 0 : PIPE_FLUSH_ASYNC) |
-                                RADEON_FLUSH_START_NEXT_GFX_IB_NOW,
-                               NULL);
-               sfence->gfx_unflushed.ctx = NULL;
-
-               if (!timeout)
-                       return false;
-
-               /* Recompute the timeout after all that. */
-               if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
-                       int64_t time = os_time_get_nano();
-                       timeout = abs_timeout > time ? abs_timeout - time : 0;
-               }
-       }
-
-       if (rws->fence_wait(rws, sfence->gfx, timeout))
-               return true;
-
-       /* Re-check in case the GPU is slow or hangs, but the commands before
-        * the fine-grained fence have completed. */
-       if (sfence->fine.buf &&
-           si_fine_fence_signaled(rws, &sfence->fine))
-               return true;
-
-       return false;
+   struct radeon_winsys *rws = ((struct si_screen *)screen)->ws;
+   struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
+   struct si_context *sctx;
+   int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
+
+   ctx = threaded_context_unwrap_sync(ctx);
+   sctx = (struct si_context *)(ctx ? ctx : NULL);
+
+   if (!util_queue_fence_is_signalled(&sfence->ready)) {
+      if (sfence->tc_token) {
+         /* Ensure that si_flush_from_st will be called for
+          * this fence, but only if we're in the API thread
+          * where the context is current.
+          *
+          * Note that the batch containing the flush may already
+          * be in flight in the driver thread, so the fence
+          * may not be ready yet when this call returns.
+          */
+         threaded_context_flush(ctx, sfence->tc_token, timeout == 0);
+      }
+
+      if (!timeout)
+         return false;
+
+      if (timeout == PIPE_TIMEOUT_INFINITE) {
+         util_queue_fence_wait(&sfence->ready);
+      } else {
+         if (!util_queue_fence_wait_timeout(&sfence->ready, abs_timeout))
+            return false;
+      }
+
+      if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
+         int64_t time = os_time_get_nano();
+         timeout = abs_timeout > time ? abs_timeout - time : 0;
+      }
+   }
+
+   if (sfence->sdma) {
+      if (!rws->fence_wait(rws, sfence->sdma, timeout))
+         return false;
+
+      /* Recompute the timeout after waiting. */
+      if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
+         int64_t time = os_time_get_nano();
+         timeout = abs_timeout > time ? abs_timeout - time : 0;
+      }
+   }
+
+   if (!sfence->gfx)
+      return true;
+
+   if (sfence->fine.buf && si_fine_fence_signaled(rws, &sfence->fine)) {
+      rws->fence_reference(&sfence->gfx, NULL);
+      si_resource_reference(&sfence->fine.buf, NULL);
+      return true;
+   }
+
+   /* Flush the gfx IB if it hasn't been flushed yet. */
+   if (sctx && sfence->gfx_unflushed.ctx == sctx &&
+       sfence->gfx_unflushed.ib_index == sctx->num_gfx_cs_flushes) {
+      /* Section 4.1.2 (Signaling) of the OpenGL 4.6 (Core profile)
+       * spec says:
+       *
+       *    "If the sync object being blocked upon will not be
+       *     signaled in finite time (for example, by an associated
+       *     fence command issued previously, but not yet flushed to
+       *     the graphics pipeline), then ClientWaitSync may hang
+       *     forever. To help prevent this behavior, if
+       *     ClientWaitSync is called and all of the following are
+       *     true:
+       *
+       *     * the SYNC_FLUSH_COMMANDS_BIT bit is set in flags,
+       *     * sync is unsignaled when ClientWaitSync is called,
+       *     * and the calls to ClientWaitSync and FenceSync were
+       *       issued from the same context,
+       *
+       *     then the GL will behave as if the equivalent of Flush
+       *     were inserted immediately after the creation of sync."
+       *
+       * This means we need to flush for such fences even when we're
+       * not going to wait.
+       */
+      si_flush_gfx_cs(sctx, (timeout ? 0 : PIPE_FLUSH_ASYNC) | RADEON_FLUSH_START_NEXT_GFX_IB_NOW,
+                      NULL);
+      sfence->gfx_unflushed.ctx = NULL;
+
+      if (!timeout)
+         return false;
+
+      /* Recompute the timeout after all that. */
+      if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
+         int64_t time = os_time_get_nano();
+         timeout = abs_timeout > time ? abs_timeout - time : 0;
+      }
+   }
+
+   if (rws->fence_wait(rws, sfence->gfx, timeout))
+      return true;
+
+   /* Re-check in case the GPU is slow or hangs, but the commands before
+    * the fine-grained fence have completed. */
+   if (sfence->fine.buf && si_fine_fence_signaled(rws, &sfence->fine))
+      return true;
+
+   return false;
 }
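
si_fence_finish converts the caller's relative timeout into an absolute deadline once, then recomputes the remaining budget after each blocking step with `abs_timeout > time ? abs_timeout - time : 0`. A standalone sketch of the same pattern using clock_gettime instead of the os_time helpers (helper names are illustrative; the real helper also special-cases PIPE_TIMEOUT_INFINITE):

#include <stdint.h>
#include <time.h>

static int64_t now_ns(void)
{
   struct timespec ts;
   clock_gettime(CLOCK_MONOTONIC, &ts);
   return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
}

/* Relative timeout -> absolute deadline, computed once up front. */
static int64_t absolute_deadline(uint64_t timeout_ns)
{
   return now_ns() + (int64_t)timeout_ns;
}

/* Remaining budget after a blocking wait; clamps to 0 like the code above. */
static uint64_t remaining_timeout(int64_t abs_deadline)
{
   int64_t t = now_ns();
   return abs_deadline > t ? (uint64_t)(abs_deadline - t) : 0;
}
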
 
-static void si_create_fence_fd(struct pipe_context *ctx,
-                              struct pipe_fence_handle **pfence, int fd,
-                              enum pipe_fd_type type)
+static void si_create_fence_fd(struct pipe_context *ctx, struct pipe_fence_handle **pfence, int fd,
+                               enum pipe_fd_type type)
 {
-       struct si_screen *sscreen = (struct si_screen*)ctx->screen;
-       struct radeon_winsys *ws = sscreen->ws;
-       struct si_multi_fence *sfence;
+   struct si_screen *sscreen = (struct si_screen *)ctx->screen;
+   struct radeon_winsys *ws = sscreen->ws;
+   struct si_multi_fence *sfence;
 
-       *pfence = NULL;
+   *pfence = NULL;
 
-       sfence = si_create_multi_fence();
-       if (!sfence)
-               return;
+   sfence = si_create_multi_fence();
+   if (!sfence)
+      return;
 
-       switch (type) {
-       case PIPE_FD_TYPE_NATIVE_SYNC:
-               if (!sscreen->info.has_fence_to_handle)
-                       goto finish;
+   switch (type) {
+   case PIPE_FD_TYPE_NATIVE_SYNC:
+      if (!sscreen->info.has_fence_to_handle)
+         goto finish;
 
-               sfence->gfx = ws->fence_import_sync_file(ws, fd);
-               break;
+      sfence->gfx = ws->fence_import_sync_file(ws, fd);
+      break;
 
-       case PIPE_FD_TYPE_SYNCOBJ:
-               if (!sscreen->info.has_syncobj)
-                       goto finish;
+   case PIPE_FD_TYPE_SYNCOBJ:
+      if (!sscreen->info.has_syncobj)
+         goto finish;
 
-               sfence->gfx = ws->fence_import_syncobj(ws, fd);
-               break;
+      sfence->gfx = ws->fence_import_syncobj(ws, fd);
+      break;
 
-       default:
-               unreachable("bad fence fd type when importing");
-       }
+   default:
+      unreachable("bad fence fd type when importing");
+   }
 
 finish:
-       if (!sfence->gfx) {
-               FREE(sfence);
-               return;
-       }
+   if (!sfence->gfx) {
+      FREE(sfence);
+      return;
+   }
 
-       *pfence = (struct pipe_fence_handle*)sfence;
+   *pfence = (struct pipe_fence_handle *)sfence;
 }
 
-static int si_fence_get_fd(struct pipe_screen *screen,
-                          struct pipe_fence_handle *fence)
+static int si_fence_get_fd(struct pipe_screen *screen, struct pipe_fence_handle *fence)
 {
-       struct si_screen *sscreen = (struct si_screen*)screen;
-       struct radeon_winsys *ws = sscreen->ws;
-       struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
-       int gfx_fd = -1, sdma_fd = -1;
-
-       if (!sscreen->info.has_fence_to_handle)
-               return -1;
-
-       util_queue_fence_wait(&sfence->ready);
-
-       /* Deferred fences aren't supported. */
-       assert(!sfence->gfx_unflushed.ctx);
-       if (sfence->gfx_unflushed.ctx)
-               return -1;
-
-       if (sfence->sdma) {
-               sdma_fd = ws->fence_export_sync_file(ws, sfence->sdma);
-               if (sdma_fd == -1)
-                       return -1;
-       }
-       if (sfence->gfx) {
-               gfx_fd = ws->fence_export_sync_file(ws, sfence->gfx);
-               if (gfx_fd == -1) {
-                       if (sdma_fd != -1)
-                               close(sdma_fd);
-                       return -1;
-               }
-       }
-
-       /* If we don't have FDs at this point, it means we don't have fences
-        * either. */
-       if (sdma_fd == -1 && gfx_fd == -1)
-               return ws->export_signalled_sync_file(ws);
-       if (sdma_fd == -1)
-               return gfx_fd;
-       if (gfx_fd == -1)
-               return sdma_fd;
-
-       /* Get a fence that will be a combination of both fences. */
-       sync_accumulate("radeonsi", &gfx_fd, sdma_fd);
-       close(sdma_fd);
-       return gfx_fd;
+   struct si_screen *sscreen = (struct si_screen *)screen;
+   struct radeon_winsys *ws = sscreen->ws;
+   struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
+   int gfx_fd = -1, sdma_fd = -1;
+
+   if (!sscreen->info.has_fence_to_handle)
+      return -1;
+
+   util_queue_fence_wait(&sfence->ready);
+
+   /* Deferred fences aren't supported. */
+   assert(!sfence->gfx_unflushed.ctx);
+   if (sfence->gfx_unflushed.ctx)
+      return -1;
+
+   if (sfence->sdma) {
+      sdma_fd = ws->fence_export_sync_file(ws, sfence->sdma);
+      if (sdma_fd == -1)
+         return -1;
+   }
+   if (sfence->gfx) {
+      gfx_fd = ws->fence_export_sync_file(ws, sfence->gfx);
+      if (gfx_fd == -1) {
+         if (sdma_fd != -1)
+            close(sdma_fd);
+         return -1;
+      }
+   }
+
+   /* If we don't have FDs at this point, it means we don't have fences
+    * either. */
+   if (sdma_fd == -1 && gfx_fd == -1)
+      return ws->export_signalled_sync_file(ws);
+   if (sdma_fd == -1)
+      return gfx_fd;
+   if (gfx_fd == -1)
+      return sdma_fd;
+
+   /* Get a fence that will be a combination of both fences. */
+   sync_accumulate("radeonsi", &gfx_fd, sdma_fd);
+   close(sdma_fd);
+   return gfx_fd;
 }
 
-static void si_flush_from_st(struct pipe_context *ctx,
-                            struct pipe_fence_handle **fence,
-                            unsigned flags)
+static void si_flush_from_st(struct pipe_context *ctx, struct pipe_fence_handle **fence,
+                             unsigned flags)
 {
-       struct pipe_screen *screen = ctx->screen;
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct radeon_winsys *ws = sctx->ws;
-       struct pipe_fence_handle *gfx_fence = NULL;
-       struct pipe_fence_handle *sdma_fence = NULL;
-       bool deferred_fence = false;
-       struct si_fine_fence fine = {};
-       unsigned rflags = PIPE_FLUSH_ASYNC;
-
-       if (flags & PIPE_FLUSH_END_OF_FRAME)
-               rflags |= PIPE_FLUSH_END_OF_FRAME;
-
-       if (flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) {
-               assert(flags & PIPE_FLUSH_DEFERRED);
-               assert(fence);
-
-               si_fine_fence_set(sctx, &fine, flags);
-       }
-
-       /* DMA IBs are preambles to gfx IBs, therefore must be flushed first. */
-       if (sctx->sdma_cs)
-               si_flush_dma_cs(sctx, rflags, fence ? &sdma_fence : NULL);
-
-       if (!radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size)) {
-               if (fence)
-                       ws->fence_reference(&gfx_fence, sctx->last_gfx_fence);
-               if (!(flags & PIPE_FLUSH_DEFERRED))
-                       ws->cs_sync_flush(sctx->gfx_cs);
-       } else {
-               /* Instead of flushing, create a deferred fence. Constraints:
-                * - The state tracker must allow a deferred flush.
-                * - The state tracker must request a fence.
-                * - fence_get_fd is not allowed.
-                * Thread safety in fence_finish must be ensured by the state tracker.
-                */
-               if (flags & PIPE_FLUSH_DEFERRED &&
-                   !(flags & PIPE_FLUSH_FENCE_FD) &&
-                   fence) {
-                       gfx_fence = sctx->ws->cs_get_next_fence(sctx->gfx_cs);
-                       deferred_fence = true;
-               } else {
-                       si_flush_gfx_cs(sctx, rflags, fence ? &gfx_fence : NULL);
-               }
-       }
-
-       /* Both engines can signal out of order, so we need to keep both fences. */
-       if (fence) {
-               struct si_multi_fence *multi_fence;
-
-               if (flags & TC_FLUSH_ASYNC) {
-                       multi_fence = (struct si_multi_fence *)*fence;
-                       assert(multi_fence);
-               } else {
-                       multi_fence = si_create_multi_fence();
-                       if (!multi_fence) {
-                               ws->fence_reference(&sdma_fence, NULL);
-                               ws->fence_reference(&gfx_fence, NULL);
-                               goto finish;
-                       }
-
-                       screen->fence_reference(screen, fence, NULL);
-                       *fence = (struct pipe_fence_handle*)multi_fence;
-               }
-
-               /* If both fences are NULL, fence_finish will always return true. */
-               multi_fence->gfx = gfx_fence;
-               multi_fence->sdma = sdma_fence;
-
-               if (deferred_fence) {
-                       multi_fence->gfx_unflushed.ctx = sctx;
-                       multi_fence->gfx_unflushed.ib_index = sctx->num_gfx_cs_flushes;
-               }
-
-               multi_fence->fine = fine;
-               fine.buf = NULL;
-
-               if (flags & TC_FLUSH_ASYNC) {
-                       util_queue_fence_signal(&multi_fence->ready);
-                       tc_unflushed_batch_token_reference(&multi_fence->tc_token, NULL);
-               }
-       }
-       assert(!fine.buf);
+   struct pipe_screen *screen = ctx->screen;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct radeon_winsys *ws = sctx->ws;
+   struct pipe_fence_handle *gfx_fence = NULL;
+   struct pipe_fence_handle *sdma_fence = NULL;
+   bool deferred_fence = false;
+   struct si_fine_fence fine = {};
+   unsigned rflags = PIPE_FLUSH_ASYNC;
+
+   if (flags & PIPE_FLUSH_END_OF_FRAME)
+      rflags |= PIPE_FLUSH_END_OF_FRAME;
+
+   if (flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) {
+      assert(flags & PIPE_FLUSH_DEFERRED);
+      assert(fence);
+
+      si_fine_fence_set(sctx, &fine, flags);
+   }
+
+   /* DMA IBs are preambles to gfx IBs, therefore must be flushed first. */
+   if (sctx->sdma_cs)
+      si_flush_dma_cs(sctx, rflags, fence ? &sdma_fence : NULL);
+
+   if (!radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size)) {
+      if (fence)
+         ws->fence_reference(&gfx_fence, sctx->last_gfx_fence);
+      if (!(flags & PIPE_FLUSH_DEFERRED))
+         ws->cs_sync_flush(sctx->gfx_cs);
+   } else {
+      /* Instead of flushing, create a deferred fence. Constraints:
+       * - The state tracker must allow a deferred flush.
+       * - The state tracker must request a fence.
+       * - fence_get_fd is not allowed.
+       * Thread safety in fence_finish must be ensured by the state tracker.
+       */
+      if (flags & PIPE_FLUSH_DEFERRED && !(flags & PIPE_FLUSH_FENCE_FD) && fence) {
+         gfx_fence = sctx->ws->cs_get_next_fence(sctx->gfx_cs);
+         deferred_fence = true;
+      } else {
+         si_flush_gfx_cs(sctx, rflags, fence ? &gfx_fence : NULL);
+      }
+   }
+
+   /* Both engines can signal out of order, so we need to keep both fences. */
+   if (fence) {
+      struct si_multi_fence *multi_fence;
+
+      if (flags & TC_FLUSH_ASYNC) {
+         multi_fence = (struct si_multi_fence *)*fence;
+         assert(multi_fence);
+      } else {
+         multi_fence = si_create_multi_fence();
+         if (!multi_fence) {
+            ws->fence_reference(&sdma_fence, NULL);
+            ws->fence_reference(&gfx_fence, NULL);
+            goto finish;
+         }
+
+         screen->fence_reference(screen, fence, NULL);
+         *fence = (struct pipe_fence_handle *)multi_fence;
+      }
+
+      /* If both fences are NULL, fence_finish will always return true. */
+      multi_fence->gfx = gfx_fence;
+      multi_fence->sdma = sdma_fence;
+
+      if (deferred_fence) {
+         multi_fence->gfx_unflushed.ctx = sctx;
+         multi_fence->gfx_unflushed.ib_index = sctx->num_gfx_cs_flushes;
+      }
+
+      multi_fence->fine = fine;
+      fine.buf = NULL;
+
+      if (flags & TC_FLUSH_ASYNC) {
+         util_queue_fence_signal(&multi_fence->ready);
+         tc_unflushed_batch_token_reference(&multi_fence->tc_token, NULL);
+      }
+   }
+   assert(!fine.buf);
 finish:
-       if (!(flags & (PIPE_FLUSH_DEFERRED | PIPE_FLUSH_ASYNC))) {
-               if (sctx->sdma_cs)
-                       ws->cs_sync_flush(sctx->sdma_cs);
-               ws->cs_sync_flush(sctx->gfx_cs);
-       }
+   if (!(flags & (PIPE_FLUSH_DEFERRED | PIPE_FLUSH_ASYNC))) {
+      if (sctx->sdma_cs)
+         ws->cs_sync_flush(sctx->sdma_cs);
+      ws->cs_sync_flush(sctx->gfx_cs);
+   }
 }
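
From the state tracker's side, the deferred path above is reached by calling pipe_context::flush with PIPE_FLUSH_DEFERRED and a fence pointer, and letting screen->fence_finish trigger the real submission later. A hedged usage sketch against the generic Gallium interfaces (the setup of `ctx` and `screen` is assumed, not shown):

#include "pipe/p_context.h"
#include "pipe/p_defines.h"
#include "pipe/p_screen.h"

/* Deferred flush: si_flush_from_st only records an unflushed gfx fence here. */
static bool wait_for_deferred_work(struct pipe_context *ctx, struct pipe_screen *screen)
{
   struct pipe_fence_handle *fence = NULL;

   ctx->flush(ctx, &fence, PIPE_FLUSH_DEFERRED);

   /* fence_finish (si_fence_finish above) flushes the gfx IB on demand and
    * then waits; an infinite timeout blocks until the GPU is done. */
   bool done = screen->fence_finish(screen, NULL, fence, PIPE_TIMEOUT_INFINITE);

   screen->fence_reference(screen, &fence, NULL);
   return done;
}
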
 
-static void si_fence_server_signal(struct pipe_context *ctx,
-                                  struct pipe_fence_handle *fence)
+static void si_fence_server_signal(struct pipe_context *ctx, struct pipe_fence_handle *fence)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
-
-       /* We should have at least one syncobj to signal */
-       assert(sfence->sdma || sfence->gfx);
-
-       if (sfence->sdma)
-               si_add_syncobj_signal(sctx, sfence->sdma);
-       if (sfence->gfx)
-               si_add_syncobj_signal(sctx, sfence->gfx);
-
-       /**
-        * The spec does not require a flush here. We insert a flush
-        * because syncobj based signals are not directly placed into
-        * the command stream. Instead the signal happens when the
-        * submission associated with the syncobj finishes execution.
-        *
-        * Therefore, we must make sure that we flush the pipe to avoid
-        * new work being emitted and getting executed before the signal
-        * operation.
-        * 
-        * Set sctx->initial_gfx_cs_size to force IB submission even if
-        * it is empty.
-        */
-       sctx->initial_gfx_cs_size = 0;
-       si_flush_from_st(ctx, NULL, PIPE_FLUSH_ASYNC);
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
+
+   /* We should have at least one syncobj to signal */
+   assert(sfence->sdma || sfence->gfx);
+
+   if (sfence->sdma)
+      si_add_syncobj_signal(sctx, sfence->sdma);
+   if (sfence->gfx)
+      si_add_syncobj_signal(sctx, sfence->gfx);
+
+   /**
+    * The spec does not require a flush here. We insert a flush
+    * because syncobj based signals are not directly placed into
+    * the command stream. Instead the signal happens when the
+    * submission associated with the syncobj finishes execution.
+    *
+    * Therefore, we must make sure that we flush the pipe to avoid
+    * new work being emitted and getting executed before the signal
+    * operation.
+    *
+    * Set sctx->initial_gfx_cs_size to force IB submission even if
+    * it is empty.
+    */
+   sctx->initial_gfx_cs_size = 0;
+   si_flush_from_st(ctx, NULL, PIPE_FLUSH_ASYNC);
 }
 
-static void si_fence_server_sync(struct pipe_context *ctx,
-                                struct pipe_fence_handle *fence)
+static void si_fence_server_sync(struct pipe_context *ctx, struct pipe_fence_handle *fence)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
-
-       util_queue_fence_wait(&sfence->ready);
-
-       /* Unflushed fences from the same context are no-ops. */
-       if (sfence->gfx_unflushed.ctx &&
-           sfence->gfx_unflushed.ctx == sctx)
-               return;
-
-       /* All unflushed commands will not start execution before
-        * this fence dependency is signalled.
-        *
-        * Therefore we must flush before inserting the dependency
-        */
-       si_flush_from_st(ctx, NULL, PIPE_FLUSH_ASYNC);
-
-       if (sfence->sdma)
-               si_add_fence_dependency(sctx, sfence->sdma);
-       if (sfence->gfx)
-               si_add_fence_dependency(sctx, sfence->gfx);
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
+
+   util_queue_fence_wait(&sfence->ready);
+
+   /* Unflushed fences from the same context are no-ops. */
+   if (sfence->gfx_unflushed.ctx && sfence->gfx_unflushed.ctx == sctx)
+      return;
+
+   /* All unflushed commands will not start execution before
+    * this fence dependency is signalled.
+    *
+    * Therefore we must flush before inserting the dependency
+    */
+   si_flush_from_st(ctx, NULL, PIPE_FLUSH_ASYNC);
+
+   if (sfence->sdma)
+      si_add_fence_dependency(sctx, sfence->sdma);
+   if (sfence->gfx)
+      si_add_fence_dependency(sctx, sfence->gfx);
 }
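
Together, create_fence_fd and fence_server_sync let one context wait on work produced elsewhere: an imported sync-file or syncobj fd becomes a pipe fence, and the receiving context makes its subsequent submissions depend on it. A hedged sketch using the generic Gallium hooks (where the fd comes from, and its error handling, are assumed):

#include "pipe/p_context.h"
#include "pipe/p_defines.h"
#include "pipe/p_screen.h"

/* Make all work submitted after this call wait for the fence behind `fd`
 * (e.g. a sync_file exported by another process or API). */
static void wait_on_external_fd(struct pipe_context *ctx, int fd)
{
   struct pipe_fence_handle *fence = NULL;

   ctx->create_fence_fd(ctx, &fence, fd, PIPE_FD_TYPE_NATIVE_SYNC);
   if (!fence)
      return; /* import not supported or failed */

   /* si_fence_server_sync above flushes, then adds the dependency to the
    * gfx (and sdma) command streams. */
   ctx->fence_server_sync(ctx, fence);

   ctx->screen->fence_reference(ctx->screen, &fence, NULL);
}
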
 
 void si_init_fence_functions(struct si_context *ctx)
 {
-       ctx->b.flush = si_flush_from_st;
-       ctx->b.create_fence_fd = si_create_fence_fd;
-       ctx->b.fence_server_sync = si_fence_server_sync;
-       ctx->b.fence_server_signal = si_fence_server_signal;
+   ctx->b.flush = si_flush_from_st;
+   ctx->b.create_fence_fd = si_create_fence_fd;
+   ctx->b.fence_server_sync = si_fence_server_sync;
+   ctx->b.fence_server_signal = si_fence_server_signal;
 }
 
 void si_init_screen_fence_functions(struct si_screen *screen)
 {
-       screen->b.fence_finish = si_fence_finish;
-       screen->b.fence_reference = si_fence_reference;
-       screen->b.fence_get_fd = si_fence_get_fd;
+   screen->b.fence_finish = si_fence_finish;
+   screen->b.fence_reference = si_fence_reference;
+   screen->b.fence_get_fd = si_fence_get_fd;
 }
index f0a00b17e7efc1c35f53b79f99d0ebce8a35cd30..2a4a23cec1386894a961b2d8ae5d4f143215074b 100644
--- a/src/gallium/drivers/radeonsi/si_get.c
+++ b/src/gallium/drivers/radeonsi/si_get.c
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include "si_pipe.h"
-#include "radeon/radeon_video.h"
-#include "radeon/radeon_vce.h"
+#include "compiler/nir/nir.h"
 #include "radeon/radeon_uvd_enc.h"
-#include "vl/vl_decoder.h"
-#include "vl/vl_video_buffer.h"
+#include "radeon/radeon_vce.h"
+#include "radeon/radeon_video.h"
+#include "si_pipe.h"
 #include "util/u_screen.h"
 #include "util/u_video.h"
-#include "compiler/nir/nir.h"
-
+#include "vl/vl_decoder.h"
+#include "vl/vl_video_buffer.h"
 #include <sys/utsname.h>
 
 static const char *si_get_vendor(struct pipe_screen *pscreen)
 {
-       /* Don't change this. Games such as Alien Isolation are broken if this
-        * returns "Advanced Micro Devices, Inc."
-        */
-       return "X.Org";
+   /* Don't change this. Games such as Alien Isolation are broken if this
+    * returns "Advanced Micro Devices, Inc."
+    */
+   return "X.Org";
 }
 
 static const char *si_get_device_vendor(struct pipe_screen *pscreen)
 {
-       return "AMD";
+   return "AMD";
 }
 
 static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 {
-       struct si_screen *sscreen = (struct si_screen *)pscreen;
-
-       switch (param) {
-       /* Supported features (boolean caps). */
-       case PIPE_CAP_ACCELERATED:
-       case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
-       case PIPE_CAP_ANISOTROPIC_FILTER:
-       case PIPE_CAP_POINT_SPRITE:
-       case PIPE_CAP_OCCLUSION_QUERY:
-       case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
-       case PIPE_CAP_TEXTURE_SHADOW_LOD:
-       case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE:
-       case PIPE_CAP_BLEND_EQUATION_SEPARATE:
-       case PIPE_CAP_TEXTURE_SWIZZLE:
-       case PIPE_CAP_DEPTH_CLIP_DISABLE:
-       case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE:
-       case PIPE_CAP_SHADER_STENCIL_EXPORT:
-       case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
-       case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
-       case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
-       case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
-       case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
-       case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD:
-       case PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES:
-       case PIPE_CAP_VERTEX_SHADER_SATURATE:
-       case PIPE_CAP_SEAMLESS_CUBE_MAP:
-       case PIPE_CAP_PRIMITIVE_RESTART:
-       case PIPE_CAP_CONDITIONAL_RENDER:
-       case PIPE_CAP_TEXTURE_BARRIER:
-       case PIPE_CAP_INDEP_BLEND_ENABLE:
-       case PIPE_CAP_INDEP_BLEND_FUNC:
-       case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
-       case PIPE_CAP_VERTEX_COLOR_UNCLAMPED:
-       case PIPE_CAP_START_INSTANCE:
-       case PIPE_CAP_NPOT_TEXTURES:
-       case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES:
-       case PIPE_CAP_MIXED_COLOR_DEPTH_BITS:
-       case PIPE_CAP_VERTEX_COLOR_CLAMPED:
-       case PIPE_CAP_FRAGMENT_COLOR_CLAMPED:
-       case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
-       case PIPE_CAP_TGSI_INSTANCEID:
-       case PIPE_CAP_COMPUTE:
-       case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
-       case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT:
-       case PIPE_CAP_QUERY_PIPELINE_STATISTICS:
-       case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
-       case PIPE_CAP_CUBE_MAP_ARRAY:
-       case PIPE_CAP_SAMPLE_SHADING:
-       case PIPE_CAP_DRAW_INDIRECT:
-       case PIPE_CAP_CLIP_HALFZ:
-       case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
-       case PIPE_CAP_POLYGON_OFFSET_CLAMP:
-       case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
-       case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
-       case PIPE_CAP_TGSI_TEXCOORD:
-       case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
-       case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
-       case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
-       case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
-       case PIPE_CAP_SHAREABLE_SHADERS:
-       case PIPE_CAP_DEPTH_BOUNDS_TEST:
-       case PIPE_CAP_SAMPLER_VIEW_TARGET:
-       case PIPE_CAP_TEXTURE_QUERY_LOD:
-       case PIPE_CAP_TEXTURE_GATHER_SM5:
-       case PIPE_CAP_TGSI_TXQS:
-       case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
-       case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
-       case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
-       case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
-       case PIPE_CAP_INVALIDATE_BUFFER:
-       case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
-       case PIPE_CAP_QUERY_BUFFER_OBJECT:
-       case PIPE_CAP_QUERY_MEMORY_INFO:
-       case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
-       case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
-       case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
-       case PIPE_CAP_GENERATE_MIPMAP:
-       case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED:
-       case PIPE_CAP_STRING_MARKER:
-       case PIPE_CAP_CLEAR_TEXTURE:
-       case PIPE_CAP_CULL_DISTANCE:
-       case PIPE_CAP_TGSI_ARRAY_COMPONENTS:
-       case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
-       case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY:
-       case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
-       case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS:
-       case PIPE_CAP_DOUBLES:
-       case PIPE_CAP_TGSI_TEX_TXF_LZ:
-       case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
-       case PIPE_CAP_BINDLESS_TEXTURE:
-       case PIPE_CAP_QUERY_TIMESTAMP:
-       case PIPE_CAP_QUERY_TIME_ELAPSED:
-       case PIPE_CAP_NIR_SAMPLERS_AS_DEREF:
-       case PIPE_CAP_MEMOBJ:
-       case PIPE_CAP_LOAD_CONSTBUF:
-       case PIPE_CAP_INT64:
-       case PIPE_CAP_INT64_DIVMOD:
-       case PIPE_CAP_TGSI_CLOCK:
-       case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
-       case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
-       case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
-       case PIPE_CAP_TGSI_BALLOT:
-       case PIPE_CAP_TGSI_VOTE:
-       case PIPE_CAP_FBFETCH:
-       case PIPE_CAP_COMPUTE_GRID_INFO_LAST_BLOCK:
-       case PIPE_CAP_IMAGE_LOAD_FORMATTED:
-       case PIPE_CAP_PREFER_COMPUTE_FOR_MULTIMEDIA:
-       case PIPE_CAP_TGSI_DIV:
-       case PIPE_CAP_PACKED_UNIFORMS:
-       case PIPE_CAP_SHADER_SAMPLES_IDENTICAL:
-       case PIPE_CAP_GL_SPIRV:
-       case PIPE_CAP_DRAW_INFO_START_WITH_USER_INDICES:
-               return 1;
-
-       case PIPE_CAP_QUERY_SO_OVERFLOW:
-               return !sscreen->use_ngg_streamout;
-
-       case PIPE_CAP_POST_DEPTH_COVERAGE:
-               return sscreen->info.chip_class >= GFX10;
-
-       case PIPE_CAP_GRAPHICS:
-               return sscreen->info.has_graphics;
-
-       case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
-               return !SI_BIG_ENDIAN && sscreen->info.has_userptr;
-
-       case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
-               return sscreen->info.has_gpu_reset_status_query;
-
-       case PIPE_CAP_TEXTURE_MULTISAMPLE:
-               return sscreen->info.has_2d_tiling;
-
-        case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
-                return SI_MAP_BUFFER_ALIGNMENT;
-
-       case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
-       case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
-       case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
-       case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
-       case PIPE_CAP_MAX_VERTEX_STREAMS:
-       case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
-       case PIPE_CAP_MAX_WINDOW_RECTANGLES:
-               return 4;
-
-       case PIPE_CAP_GLSL_FEATURE_LEVEL:
-       case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
-               if (!sscreen->info.has_indirect_compute_dispatch)
-                       return 420;
-               return 460;
-
-       case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET:
-               /* Optimal number for good TexSubImage performance on Polaris10. */
-               return 64 * 1024 * 1024;
-
-       case PIPE_CAP_GL_BEGIN_END_BUFFER_SIZE:
-               return 4096 * 1024;
-
-       case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
-       case PIPE_CAP_MAX_SHADER_BUFFER_SIZE:
-               return MIN2(sscreen->info.max_alloc_size, INT_MAX);
-
-       case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
-       case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
-       case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
-               return LLVM_VERSION_MAJOR < 9 && !sscreen->info.has_unaligned_shader_loads;
-
-       case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
-               return sscreen->info.has_sparse_vm_mappings ?
-                               RADEON_SPARSE_PAGE_SIZE : 0;
-
-
-       case PIPE_CAP_UMA:
-               return 0;
-
-       case PIPE_CAP_FENCE_SIGNAL:
-               return sscreen->info.has_syncobj;
-
-       case PIPE_CAP_CONSTBUF0_FLAGS:
-               return SI_RESOURCE_FLAG_32BIT;
-
-       case PIPE_CAP_NATIVE_FENCE_FD:
-               return sscreen->info.has_fence_to_handle;
-
-       case PIPE_CAP_DRAW_PARAMETERS:
-       case PIPE_CAP_MULTI_DRAW_INDIRECT:
-       case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
-               return sscreen->has_draw_indirect_multi;
-
-       case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
-               return 30;
-
-       case PIPE_CAP_MAX_VARYINGS:
-               return 32;
-
-       case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
-               return sscreen->info.chip_class <= GFX8 ?
-                       PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600 : 0;
-
-       /* Stream output. */
-       case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
-       case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
-               return 32*4;
-
-       /* Geometry shader output. */
-       case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES:
-               /* gfx9 has to report 256 to make piglit/gs-max-output pass.
-                * gfx8 and earlier can do 1024.
-                */
-               return 256;
-       case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS:
-               return 4095;
-       case PIPE_CAP_MAX_GS_INVOCATIONS:
-               /* The closed driver exposes 127, but 125 is the greatest
-                * number that works. */
-               return 125;
-
-       case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
-               return 2048;
-
-       /* Texturing. */
-       case PIPE_CAP_MAX_TEXTURE_2D_SIZE:
-               return 16384;
-       case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
-               return 15; /* 16384 */
-       case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
-               if (sscreen->info.chip_class >= GFX10)
-                       return 14;
-               /* textures support 8192, but layered rendering supports 2048 */
-               return 12;
-       case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
-               if (sscreen->info.chip_class >= GFX10)
-                       return 8192;
-               /* textures support 8192, but layered rendering supports 2048 */
-               return 2048;
-
-       /* Viewports and render targets. */
-       case PIPE_CAP_MAX_VIEWPORTS:
-               return SI_MAX_VIEWPORTS;
-       case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS:
-       case PIPE_CAP_RASTERIZER_SUBPIXEL_BITS:
-       case PIPE_CAP_MAX_RENDER_TARGETS:
-               return 8;
-       case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS:
-               return sscreen->info.has_eqaa_surface_allocator ? 2 : 0;
-
-       case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET:
-       case PIPE_CAP_MIN_TEXEL_OFFSET:
-               return -32;
-
-       case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET:
-       case PIPE_CAP_MAX_TEXEL_OFFSET:
-               return 31;
-
-       case PIPE_CAP_ENDIANNESS:
-               return PIPE_ENDIAN_LITTLE;
-
-       case PIPE_CAP_VENDOR_ID:
-               return ATI_VENDOR_ID;
-       case PIPE_CAP_DEVICE_ID:
-               return sscreen->info.pci_id;
-       case PIPE_CAP_VIDEO_MEMORY:
-               return sscreen->info.vram_size >> 20;
-       case PIPE_CAP_PCI_GROUP:
-               return sscreen->info.pci_domain;
-       case PIPE_CAP_PCI_BUS:
-               return sscreen->info.pci_bus;
-       case PIPE_CAP_PCI_DEVICE:
-               return sscreen->info.pci_dev;
-       case PIPE_CAP_PCI_FUNCTION:
-               return sscreen->info.pci_func;
-       case PIPE_CAP_TGSI_ATOMINC_WRAP:
-               return LLVM_VERSION_MAJOR >= 10;
-
-       default:
-               return u_pipe_screen_get_param_defaults(pscreen, param);
-       }
+   struct si_screen *sscreen = (struct si_screen *)pscreen;
+
+   switch (param) {
+   /* Supported features (boolean caps). */
+   case PIPE_CAP_ACCELERATED:
+   case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
+   case PIPE_CAP_ANISOTROPIC_FILTER:
+   case PIPE_CAP_POINT_SPRITE:
+   case PIPE_CAP_OCCLUSION_QUERY:
+   case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+   case PIPE_CAP_TEXTURE_SHADOW_LOD:
+   case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE:
+   case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+   case PIPE_CAP_TEXTURE_SWIZZLE:
+   case PIPE_CAP_DEPTH_CLIP_DISABLE:
+   case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE:
+   case PIPE_CAP_SHADER_STENCIL_EXPORT:
+   case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
+   case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+   case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD:
+   case PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES:
+   case PIPE_CAP_VERTEX_SHADER_SATURATE:
+   case PIPE_CAP_SEAMLESS_CUBE_MAP:
+   case PIPE_CAP_PRIMITIVE_RESTART:
+   case PIPE_CAP_CONDITIONAL_RENDER:
+   case PIPE_CAP_TEXTURE_BARRIER:
+   case PIPE_CAP_INDEP_BLEND_ENABLE:
+   case PIPE_CAP_INDEP_BLEND_FUNC:
+   case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
+   case PIPE_CAP_VERTEX_COLOR_UNCLAMPED:
+   case PIPE_CAP_START_INSTANCE:
+   case PIPE_CAP_NPOT_TEXTURES:
+   case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES:
+   case PIPE_CAP_MIXED_COLOR_DEPTH_BITS:
+   case PIPE_CAP_VERTEX_COLOR_CLAMPED:
+   case PIPE_CAP_FRAGMENT_COLOR_CLAMPED:
+   case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
+   case PIPE_CAP_TGSI_INSTANCEID:
+   case PIPE_CAP_COMPUTE:
+   case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
+   case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT:
+   case PIPE_CAP_QUERY_PIPELINE_STATISTICS:
+   case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
+   case PIPE_CAP_CUBE_MAP_ARRAY:
+   case PIPE_CAP_SAMPLE_SHADING:
+   case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_CLIP_HALFZ:
+   case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
+   case PIPE_CAP_POLYGON_OFFSET_CLAMP:
+   case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
+   case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
+   case PIPE_CAP_TGSI_TEXCOORD:
+   case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
+   case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
+   case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+   case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+   case PIPE_CAP_SHAREABLE_SHADERS:
+   case PIPE_CAP_DEPTH_BOUNDS_TEST:
+   case PIPE_CAP_SAMPLER_VIEW_TARGET:
+   case PIPE_CAP_TEXTURE_QUERY_LOD:
+   case PIPE_CAP_TEXTURE_GATHER_SM5:
+   case PIPE_CAP_TGSI_TXQS:
+   case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+   case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_INVALIDATE_BUFFER:
+   case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+   case PIPE_CAP_QUERY_BUFFER_OBJECT:
+   case PIPE_CAP_QUERY_MEMORY_INFO:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
+   case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
+   case PIPE_CAP_GENERATE_MIPMAP:
+   case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED:
+   case PIPE_CAP_STRING_MARKER:
+   case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_CULL_DISTANCE:
+   case PIPE_CAP_TGSI_ARRAY_COMPONENTS:
+   case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
+   case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY:
+   case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
+   case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS:
+   case PIPE_CAP_DOUBLES:
+   case PIPE_CAP_TGSI_TEX_TXF_LZ:
+   case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
+   case PIPE_CAP_BINDLESS_TEXTURE:
+   case PIPE_CAP_QUERY_TIMESTAMP:
+   case PIPE_CAP_QUERY_TIME_ELAPSED:
+   case PIPE_CAP_NIR_SAMPLERS_AS_DEREF:
+   case PIPE_CAP_MEMOBJ:
+   case PIPE_CAP_LOAD_CONSTBUF:
+   case PIPE_CAP_INT64:
+   case PIPE_CAP_INT64_DIVMOD:
+   case PIPE_CAP_TGSI_CLOCK:
+   case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
+   case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
+   case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
+   case PIPE_CAP_TGSI_BALLOT:
+   case PIPE_CAP_TGSI_VOTE:
+   case PIPE_CAP_FBFETCH:
+   case PIPE_CAP_COMPUTE_GRID_INFO_LAST_BLOCK:
+   case PIPE_CAP_IMAGE_LOAD_FORMATTED:
+   case PIPE_CAP_PREFER_COMPUTE_FOR_MULTIMEDIA:
+   case PIPE_CAP_TGSI_DIV:
+   case PIPE_CAP_PACKED_UNIFORMS:
+   case PIPE_CAP_SHADER_SAMPLES_IDENTICAL:
+   case PIPE_CAP_GL_SPIRV:
+   case PIPE_CAP_DRAW_INFO_START_WITH_USER_INDICES:
+      return 1;
+
+   case PIPE_CAP_QUERY_SO_OVERFLOW:
+      return !sscreen->use_ngg_streamout;
+
+   case PIPE_CAP_POST_DEPTH_COVERAGE:
+      return sscreen->info.chip_class >= GFX10;
+
+   case PIPE_CAP_GRAPHICS:
+      return sscreen->info.has_graphics;
+
+   case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+      return !SI_BIG_ENDIAN && sscreen->info.has_userptr;
+
+   case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+      return sscreen->info.has_gpu_reset_status_query;
+
+   case PIPE_CAP_TEXTURE_MULTISAMPLE:
+      return sscreen->info.has_2d_tiling;
+
+   case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
+      return SI_MAP_BUFFER_ALIGNMENT;
+
+   case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
+   case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
+   case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
+   case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
+   case PIPE_CAP_MAX_VERTEX_STREAMS:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+   case PIPE_CAP_MAX_WINDOW_RECTANGLES:
+      return 4;
+
+   case PIPE_CAP_GLSL_FEATURE_LEVEL:
+   case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
+      if (!sscreen->info.has_indirect_compute_dispatch)
+         return 420;
+      return 460;
+
+   case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET:
+      /* Optimal number for good TexSubImage performance on Polaris10. */
+      return 64 * 1024 * 1024;
+
+   case PIPE_CAP_GL_BEGIN_END_BUFFER_SIZE:
+      return 4096 * 1024;
+
+   case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
+   case PIPE_CAP_MAX_SHADER_BUFFER_SIZE:
+      return MIN2(sscreen->info.max_alloc_size, INT_MAX);
+
+   case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
+   case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
+   case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
+      return LLVM_VERSION_MAJOR < 9 && !sscreen->info.has_unaligned_shader_loads;
+
+   case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
+      return sscreen->info.has_sparse_vm_mappings ? RADEON_SPARSE_PAGE_SIZE : 0;
+
+   case PIPE_CAP_UMA:
+      return 0;
+
+   case PIPE_CAP_FENCE_SIGNAL:
+      return sscreen->info.has_syncobj;
+
+   case PIPE_CAP_CONSTBUF0_FLAGS:
+      return SI_RESOURCE_FLAG_32BIT;
+
+   case PIPE_CAP_NATIVE_FENCE_FD:
+      return sscreen->info.has_fence_to_handle;
+
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+      return sscreen->has_draw_indirect_multi;
+
+   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+      return 30;
+
+   case PIPE_CAP_MAX_VARYINGS:
+      return 32;
+
+   case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
+      return sscreen->info.chip_class <= GFX8 ? PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600 : 0;
+
+   /* Stream output. */
+   case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
+   case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
+      return 32 * 4;
+
+   /* Geometry shader output. */
+   case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES:
+      /* gfx9 has to report 256 to make piglit/gs-max-output pass.
+       * gfx8 and earlier can do 1024.
+       */
+      return 256;
+   case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS:
+      return 4095;
+   case PIPE_CAP_MAX_GS_INVOCATIONS:
+      /* The closed driver exposes 127, but 125 is the greatest
+       * number that works. */
+      return 125;
+
+   case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
+      return 2048;
+
+   /* Texturing. */
+   case PIPE_CAP_MAX_TEXTURE_2D_SIZE:
+      return 16384;
+   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+      return 15; /* 16384 */
+   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+      if (sscreen->info.chip_class >= GFX10)
+         return 14;
+      /* textures support 8192, but layered rendering supports 2048 */
+      return 12;
+   case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
+      if (sscreen->info.chip_class >= GFX10)
+         return 8192;
+      /* textures support 8192, but layered rendering supports 2048 */
+      return 2048;
+
+   /* Viewports and render targets. */
+   case PIPE_CAP_MAX_VIEWPORTS:
+      return SI_MAX_VIEWPORTS;
+   case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS:
+   case PIPE_CAP_RASTERIZER_SUBPIXEL_BITS:
+   case PIPE_CAP_MAX_RENDER_TARGETS:
+      return 8;
+   case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS:
+      return sscreen->info.has_eqaa_surface_allocator ? 2 : 0;
+
+   case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET:
+   case PIPE_CAP_MIN_TEXEL_OFFSET:
+      return -32;
+
+   case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET:
+   case PIPE_CAP_MAX_TEXEL_OFFSET:
+      return 31;
+
+   case PIPE_CAP_ENDIANNESS:
+      return PIPE_ENDIAN_LITTLE;
+
+   case PIPE_CAP_VENDOR_ID:
+      return ATI_VENDOR_ID;
+   case PIPE_CAP_DEVICE_ID:
+      return sscreen->info.pci_id;
+   case PIPE_CAP_VIDEO_MEMORY:
+      return sscreen->info.vram_size >> 20;
+   case PIPE_CAP_PCI_GROUP:
+      return sscreen->info.pci_domain;
+   case PIPE_CAP_PCI_BUS:
+      return sscreen->info.pci_bus;
+   case PIPE_CAP_PCI_DEVICE:
+      return sscreen->info.pci_dev;
+   case PIPE_CAP_PCI_FUNCTION:
+      return sscreen->info.pci_func;
+   case PIPE_CAP_TGSI_ATOMINC_WRAP:
+      return LLVM_VERSION_MAJOR >= 10;
+
+   default:
+      return u_pipe_screen_get_param_defaults(pscreen, param);
+   }
 }
 
-static float si_get_paramf(struct pipe_screen* pscreen, enum pipe_capf param)
+static float si_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
 {
-       switch (param) {
-       case PIPE_CAPF_MAX_LINE_WIDTH:
-       case PIPE_CAPF_MAX_LINE_WIDTH_AA:
-               /* This depends on the quant mode, though the precise interactions
-                * are unknown. */
-               return 2048;
-       case PIPE_CAPF_MAX_POINT_WIDTH:
-       case PIPE_CAPF_MAX_POINT_WIDTH_AA:
-               return SI_MAX_POINT_SIZE;
-       case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
-               return 16.0f;
-       case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
-               return 16.0f;
-       case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
-       case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
-       case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
-               return 0.0f;
-       }
-       return 0.0f;
+   switch (param) {
+   case PIPE_CAPF_MAX_LINE_WIDTH:
+   case PIPE_CAPF_MAX_LINE_WIDTH_AA:
+      /* This depends on the quant mode, though the precise interactions
+       * are unknown. */
+      return 2048;
+   case PIPE_CAPF_MAX_POINT_WIDTH:
+   case PIPE_CAPF_MAX_POINT_WIDTH_AA:
+      return SI_MAX_POINT_SIZE;
+   case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
+      return 16.0f;
+   case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
+      return 16.0f;
+   case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
+   case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
+   case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
+      return 0.0f;
+   }
+   return 0.0f;
 }
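
si_get_param and si_get_paramf implement the pipe_screen::get_param / get_paramf hooks; each PIPE_CAP_* or PIPE_CAPF_* query returns either a boolean feature flag or a numeric limit. A short sketch of how a state tracker might query a few of the caps handled above (the `screen` pointer is assumed to come from the usual screen-creation path):

#include <stdio.h>

#include "pipe/p_defines.h"
#include "pipe/p_screen.h"

/* Query a couple of the capabilities implemented above. */
static void print_some_caps(struct pipe_screen *screen)
{
   int max_rts = screen->get_param(screen, PIPE_CAP_MAX_RENDER_TARGETS);         /* 8 */
   int glsl = screen->get_param(screen, PIPE_CAP_GLSL_FEATURE_LEVEL);            /* 420 or 460 */
   float max_aniso = screen->get_paramf(screen, PIPE_CAPF_MAX_TEXTURE_ANISOTROPY); /* 16.0 */

   printf("MRTs: %d, GLSL: %d, max aniso: %.1f\n", max_rts, glsl, max_aniso);
}
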
 
-static int si_get_shader_param(struct pipe_screen* pscreen,
-                              enum pipe_shader_type shader,
-                              enum pipe_shader_cap param)
+static int si_get_shader_param(struct pipe_screen *pscreen, enum pipe_shader_type shader,
+                               enum pipe_shader_cap param)
 {
-       struct si_screen *sscreen = (struct si_screen *)pscreen;
-
-       switch(shader)
-       {
-       case PIPE_SHADER_FRAGMENT:
-       case PIPE_SHADER_VERTEX:
-       case PIPE_SHADER_GEOMETRY:
-       case PIPE_SHADER_TESS_CTRL:
-       case PIPE_SHADER_TESS_EVAL:
-               break;
-       case PIPE_SHADER_COMPUTE:
-               switch (param) {
-               case PIPE_SHADER_CAP_SUPPORTED_IRS: {
-                       int ir = 1 << PIPE_SHADER_IR_NATIVE;
-
-                       if (sscreen->info.has_indirect_compute_dispatch)
-                               ir |= 1 << PIPE_SHADER_IR_NIR;
-
-                       return ir;
-               }
-
-               case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: {
-                       uint64_t max_const_buffer_size;
-                       pscreen->get_compute_param(pscreen, PIPE_SHADER_IR_NIR,
-                               PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE,
-                               &max_const_buffer_size);
-                       return MIN2(max_const_buffer_size, INT_MAX);
-               }
-               default:
-                       /* If compute shaders don't require a special value
-                        * for this cap, we can return the same value we
-                        * do for other shader types. */
-                       break;
-               }
-               break;
-       default:
-               return 0;
-       }
-
-       switch (param) {
-       /* Shader limits. */
-       case PIPE_SHADER_CAP_MAX_INSTRUCTIONS:
-       case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS:
-       case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS:
-       case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS:
-       case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
-               return 16384;
-       case PIPE_SHADER_CAP_MAX_INPUTS:
-               return shader == PIPE_SHADER_VERTEX ? SI_MAX_ATTRIBS : 32;
-       case PIPE_SHADER_CAP_MAX_OUTPUTS:
-               return shader == PIPE_SHADER_FRAGMENT ? 8 : 32;
-       case PIPE_SHADER_CAP_MAX_TEMPS:
-               return 256; /* Max native temporaries. */
-       case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
-               return MIN2(sscreen->info.max_alloc_size, INT_MAX - 3); /* aligned to 4 */
-       case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
-               return SI_NUM_CONST_BUFFERS;
-       case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
-       case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
-               return SI_NUM_SAMPLERS;
-       case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
-               return SI_NUM_SHADER_BUFFERS;
-       case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
-               return SI_NUM_IMAGES;
-       case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
-               return 0;
-       case PIPE_SHADER_CAP_PREFERRED_IR:
-               return PIPE_SHADER_IR_NIR;
-       case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
-               return 4;
-
-       /* Supported boolean features. */
-       case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
-       case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
-       case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
-       case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
-       case PIPE_SHADER_CAP_INTEGERS:
-       case PIPE_SHADER_CAP_INT64_ATOMICS:
-       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
-       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
-       case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
-       case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
-       case PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED:
-       case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
-               return 1;
-
-       case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
-               /* TODO: Indirect indexing of GS inputs is unimplemented. */
-               if (shader == PIPE_SHADER_GEOMETRY)
-                       return 0;
-
-               if (shader == PIPE_SHADER_VERTEX &&
-                   !sscreen->llvm_has_working_vgpr_indexing)
-                       return 0;
-
-               /* TCS and TES load inputs directly from LDS or offchip
-                * memory, so indirect indexing is always supported.
-                * PS has to support indirect indexing, because we can't
-                * lower that to TEMPs for INTERP instructions.
-                */
-               return 1;
-
-       case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
-               return sscreen->llvm_has_working_vgpr_indexing ||
-                      /* TCS stores outputs directly to memory. */
-                      shader == PIPE_SHADER_TESS_CTRL;
-
-       /* Unsupported boolean features. */
-       case PIPE_SHADER_CAP_FP16:
-       case PIPE_SHADER_CAP_SUBROUTINES:
-       case PIPE_SHADER_CAP_SUPPORTED_IRS:
-       case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
-       case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
-               return 0;
-       }
-       return 0;
+   struct si_screen *sscreen = (struct si_screen *)pscreen;
+
+   switch (shader) {
+   case PIPE_SHADER_FRAGMENT:
+   case PIPE_SHADER_VERTEX:
+   case PIPE_SHADER_GEOMETRY:
+   case PIPE_SHADER_TESS_CTRL:
+   case PIPE_SHADER_TESS_EVAL:
+      break;
+   case PIPE_SHADER_COMPUTE:
+      switch (param) {
+      case PIPE_SHADER_CAP_SUPPORTED_IRS: {
+         int ir = 1 << PIPE_SHADER_IR_NATIVE;
+
+         if (sscreen->info.has_indirect_compute_dispatch)
+            ir |= 1 << PIPE_SHADER_IR_NIR;
+
+         return ir;
+      }
+
+      case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: {
+         uint64_t max_const_buffer_size;
+         pscreen->get_compute_param(pscreen, PIPE_SHADER_IR_NIR,
+                                    PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE, &max_const_buffer_size);
+         return MIN2(max_const_buffer_size, INT_MAX);
+      }
+      default:
+         /* If compute shaders don't require a special value
+          * for this cap, we can return the same value we
+          * do for other shader types. */
+         break;
+      }
+      break;
+   default:
+      return 0;
+   }
+
+   switch (param) {
+   /* Shader limits. */
+   case PIPE_SHADER_CAP_MAX_INSTRUCTIONS:
+   case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS:
+   case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS:
+   case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS:
+   case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
+      return 16384;
+   case PIPE_SHADER_CAP_MAX_INPUTS:
+      return shader == PIPE_SHADER_VERTEX ? SI_MAX_ATTRIBS : 32;
+   case PIPE_SHADER_CAP_MAX_OUTPUTS:
+      return shader == PIPE_SHADER_FRAGMENT ? 8 : 32;
+   case PIPE_SHADER_CAP_MAX_TEMPS:
+      return 256; /* Max native temporaries. */
+   case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
+      return MIN2(sscreen->info.max_alloc_size, INT_MAX - 3); /* aligned to 4 */
+   case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
+      return SI_NUM_CONST_BUFFERS;
+   case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
+   case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
+      return SI_NUM_SAMPLERS;
+   case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+      return SI_NUM_SHADER_BUFFERS;
+   case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
+      return SI_NUM_IMAGES;
+   case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+      return 0;
+   case PIPE_SHADER_CAP_PREFERRED_IR:
+      return PIPE_SHADER_IR_NIR;
+   case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+      return 4;
+
+   /* Supported boolean features. */
+   case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
+   case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
+   case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
+   case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
+   case PIPE_SHADER_CAP_INTEGERS:
+   case PIPE_SHADER_CAP_INT64_ATOMICS:
+   case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+   case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+   case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
+   case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
+   case PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED:
+   case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
+      return 1;
+
+   case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
+      /* TODO: Indirect indexing of GS inputs is unimplemented. */
+      if (shader == PIPE_SHADER_GEOMETRY)
+         return 0;
+
+      if (shader == PIPE_SHADER_VERTEX && !sscreen->llvm_has_working_vgpr_indexing)
+         return 0;
+
+      /* TCS and TES load inputs directly from LDS or offchip
+       * memory, so indirect indexing is always supported.
+       * PS has to support indirect indexing, because we can't
+       * lower that to TEMPs for INTERP instructions.
+       */
+      return 1;
+
+   case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
+      return sscreen->llvm_has_working_vgpr_indexing ||
+             /* TCS stores outputs directly to memory. */
+             shader == PIPE_SHADER_TESS_CTRL;
+
+   /* Unsupported boolean features. */
+   case PIPE_SHADER_CAP_FP16:
+   case PIPE_SHADER_CAP_SUBROUTINES:
+   case PIPE_SHADER_CAP_SUPPORTED_IRS:
+   case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
+   case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
+      return 0;
+   }
+   return 0;
 }
 
 static const struct nir_shader_compiler_options nir_options = {
-       .lower_scmp = true,
-       .lower_flrp32 = true,
-       .lower_flrp64 = true,
-       .lower_fsat = true,
-       .lower_fdiv = true,
-       .lower_bitfield_insert_to_bitfield_select = true,
-       .lower_bitfield_extract = true,
-       .lower_sub = true,
-       .fuse_ffma = true,
-       .lower_fmod = true,
-       .lower_pack_snorm_4x8 = true,
-       .lower_pack_unorm_4x8 = true,
-       .lower_unpack_snorm_2x16 = true,
-       .lower_unpack_snorm_4x8 = true,
-       .lower_unpack_unorm_2x16 = true,
-       .lower_unpack_unorm_4x8 = true,
-       .lower_extract_byte = true,
-       .lower_extract_word = true,
-       .lower_rotate = true,
-       .lower_to_scalar = true,
-       .optimize_sample_mask_in = true,
-       .max_unroll_iterations = 32,
-       .use_interpolated_input_intrinsics = true,
+   .lower_scmp = true,
+   .lower_flrp32 = true,
+   .lower_flrp64 = true,
+   .lower_fsat = true,
+   .lower_fdiv = true,
+   .lower_bitfield_insert_to_bitfield_select = true,
+   .lower_bitfield_extract = true,
+   .lower_sub = true,
+   .fuse_ffma = true,
+   .lower_fmod = true,
+   .lower_pack_snorm_4x8 = true,
+   .lower_pack_unorm_4x8 = true,
+   .lower_unpack_snorm_2x16 = true,
+   .lower_unpack_snorm_4x8 = true,
+   .lower_unpack_unorm_2x16 = true,
+   .lower_unpack_unorm_4x8 = true,
+   .lower_extract_byte = true,
+   .lower_extract_word = true,
+   .lower_rotate = true,
+   .lower_to_scalar = true,
+   .optimize_sample_mask_in = true,
+   .max_unroll_iterations = 32,
+   .use_interpolated_input_intrinsics = true,
 };
 
-static const void *
-si_get_compiler_options(struct pipe_screen *screen,
-                       enum pipe_shader_ir ir,
-                       enum pipe_shader_type shader)
+static const void *si_get_compiler_options(struct pipe_screen *screen, enum pipe_shader_ir ir,
+                                           enum pipe_shader_type shader)
 {
-       assert(ir == PIPE_SHADER_IR_NIR);
-       return &nir_options;
+   assert(ir == PIPE_SHADER_IR_NIR);
+   return &nir_options;
 }
 
 static void si_get_driver_uuid(struct pipe_screen *pscreen, char *uuid)
 {
-       ac_compute_driver_uuid(uuid, PIPE_UUID_SIZE);
+   ac_compute_driver_uuid(uuid, PIPE_UUID_SIZE);
 }
 
 static void si_get_device_uuid(struct pipe_screen *pscreen, char *uuid)
 {
-       struct si_screen *sscreen = (struct si_screen *)pscreen;
+   struct si_screen *sscreen = (struct si_screen *)pscreen;
 
-       ac_compute_device_uuid(&sscreen->info, uuid, PIPE_UUID_SIZE);
+   ac_compute_device_uuid(&sscreen->info, uuid, PIPE_UUID_SIZE);
 }
 
-static const char* si_get_name(struct pipe_screen *pscreen)
+static const char *si_get_name(struct pipe_screen *pscreen)
 {
-       struct si_screen *sscreen = (struct si_screen*)pscreen;
+   struct si_screen *sscreen = (struct si_screen *)pscreen;
 
-       return sscreen->renderer_string;
+   return sscreen->renderer_string;
 }
 
-static int si_get_video_param_no_decode(struct pipe_screen *screen,
-                                       enum pipe_video_profile profile,
-                                       enum pipe_video_entrypoint entrypoint,
-                                       enum pipe_video_cap param)
+static int si_get_video_param_no_decode(struct pipe_screen *screen, enum pipe_video_profile profile,
+                                        enum pipe_video_entrypoint entrypoint,
+                                        enum pipe_video_cap param)
 {
-       switch (param) {
-       case PIPE_VIDEO_CAP_SUPPORTED:
-               return vl_profile_supported(screen, profile, entrypoint);
-       case PIPE_VIDEO_CAP_NPOT_TEXTURES:
-               return 1;
-       case PIPE_VIDEO_CAP_MAX_WIDTH:
-       case PIPE_VIDEO_CAP_MAX_HEIGHT:
-               return vl_video_buffer_max_size(screen);
-       case PIPE_VIDEO_CAP_PREFERED_FORMAT:
-               return PIPE_FORMAT_NV12;
-       case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
-               return false;
-       case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
-               return false;
-       case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
-               return true;
-       case PIPE_VIDEO_CAP_MAX_LEVEL:
-               return vl_level_supported(screen, profile);
-       default:
-               return 0;
-       }
+   switch (param) {
+   case PIPE_VIDEO_CAP_SUPPORTED:
+      return vl_profile_supported(screen, profile, entrypoint);
+   case PIPE_VIDEO_CAP_NPOT_TEXTURES:
+      return 1;
+   case PIPE_VIDEO_CAP_MAX_WIDTH:
+   case PIPE_VIDEO_CAP_MAX_HEIGHT:
+      return vl_video_buffer_max_size(screen);
+   case PIPE_VIDEO_CAP_PREFERED_FORMAT:
+      return PIPE_FORMAT_NV12;
+   case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
+      return false;
+   case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
+      return false;
+   case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
+      return true;
+   case PIPE_VIDEO_CAP_MAX_LEVEL:
+      return vl_level_supported(screen, profile);
+   default:
+      return 0;
+   }
 }
 
-static int si_get_video_param(struct pipe_screen *screen,
-                             enum pipe_video_profile profile,
-                             enum pipe_video_entrypoint entrypoint,
-                             enum pipe_video_cap param)
+static int si_get_video_param(struct pipe_screen *screen, enum pipe_video_profile profile,
+                              enum pipe_video_entrypoint entrypoint, enum pipe_video_cap param)
 {
-       struct si_screen *sscreen = (struct si_screen *)screen;
-       enum pipe_video_format codec = u_reduce_video_profile(profile);
-
-       if (entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) {
-               switch (param) {
-               case PIPE_VIDEO_CAP_SUPPORTED:
-                       return ((codec == PIPE_VIDEO_FORMAT_MPEG4_AVC &&
-                               (sscreen->info.family >= CHIP_RAVEN ||
-                                si_vce_is_fw_version_supported(sscreen))) ||
-                               (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN &&
-                               (sscreen->info.family >= CHIP_RAVEN ||
-                                si_radeon_uvd_enc_supported(sscreen))) ||
-                               (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10 &&
-                                sscreen->info.family >= CHIP_RENOIR));
-               case PIPE_VIDEO_CAP_NPOT_TEXTURES:
-                       return 1;
-               case PIPE_VIDEO_CAP_MAX_WIDTH:
-                       return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096;
-               case PIPE_VIDEO_CAP_MAX_HEIGHT:
-                       return (sscreen->info.family < CHIP_TONGA) ? 1152 : 2304;
-               case PIPE_VIDEO_CAP_PREFERED_FORMAT:
-                       return PIPE_FORMAT_NV12;
-               case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
-                       return false;
-               case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
-                       return false;
-               case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
-                       return true;
-               case PIPE_VIDEO_CAP_STACKED_FRAMES:
-                       return (sscreen->info.family < CHIP_TONGA) ? 1 : 2;
-               default:
-                       return 0;
-               }
-       }
-
-       switch (param) {
-       case PIPE_VIDEO_CAP_SUPPORTED:
-               switch (codec) {
-               case PIPE_VIDEO_FORMAT_MPEG12:
-                       return profile != PIPE_VIDEO_PROFILE_MPEG1;
-               case PIPE_VIDEO_FORMAT_MPEG4:
-                       return 1;
-               case PIPE_VIDEO_FORMAT_MPEG4_AVC:
-                       if ((sscreen->info.family == CHIP_POLARIS10 ||
-                            sscreen->info.family == CHIP_POLARIS11) &&
-                           sscreen->info.uvd_fw_version < UVD_FW_1_66_16 ) {
-                               RVID_ERR("POLARIS10/11 firmware version needs to be updated.\n");
-                               return false;
-                       }
-                       return true;
-               case PIPE_VIDEO_FORMAT_VC1:
-                       return true;
-               case PIPE_VIDEO_FORMAT_HEVC:
-                       /* Carrizo only supports HEVC Main */
-                       if (sscreen->info.family >= CHIP_STONEY)
-                               return (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN ||
-                                       profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10);
-                       else if (sscreen->info.family >= CHIP_CARRIZO)
-                               return profile == PIPE_VIDEO_PROFILE_HEVC_MAIN;
-                       return false;
-               case PIPE_VIDEO_FORMAT_JPEG:
-                       if (sscreen->info.family >= CHIP_RAVEN)
-                               return true;
-                       if (sscreen->info.family < CHIP_CARRIZO || sscreen->info.family >= CHIP_VEGA10)
-                               return false;
-                       if (!(sscreen->info.is_amdgpu && sscreen->info.drm_minor >= 19)) {
-                               RVID_ERR("No MJPEG support for the kernel version\n");
-                               return false;
-                       }
-                       return true;
-               case PIPE_VIDEO_FORMAT_VP9:
-                       if (sscreen->info.family < CHIP_RAVEN)
-                               return false;
-                       return true;
-               default:
-                       return false;
-               }
-       case PIPE_VIDEO_CAP_NPOT_TEXTURES:
-               return 1;
-       case PIPE_VIDEO_CAP_MAX_WIDTH:
-               switch (codec) {
-               case PIPE_VIDEO_FORMAT_HEVC:
-               case PIPE_VIDEO_FORMAT_VP9:
-                       return (sscreen->info.family < CHIP_RENOIR) ?
-                              ((sscreen->info.family < CHIP_TONGA) ? 2048 : 4096) :
-                              8192;
-               default:
-                       return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096;
-               }
-       case PIPE_VIDEO_CAP_MAX_HEIGHT:
-               switch (codec) {
-               case PIPE_VIDEO_FORMAT_HEVC:
-               case PIPE_VIDEO_FORMAT_VP9:
-                       return (sscreen->info.family < CHIP_RENOIR) ?
-                              ((sscreen->info.family < CHIP_TONGA) ? 1152 : 4096) :
-                              4352;
-               default:
-                       return (sscreen->info.family < CHIP_TONGA) ? 1152 : 4096;
-               }
-       case PIPE_VIDEO_CAP_PREFERED_FORMAT:
-               if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10)
-                       return PIPE_FORMAT_P010;
-               else if (profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2)
-                       return PIPE_FORMAT_P016;
-               else
-                       return PIPE_FORMAT_NV12;
-
-       case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
-       case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: {
-               enum pipe_video_format format = u_reduce_video_profile(profile);
-
-               if (format == PIPE_VIDEO_FORMAT_HEVC)
-                       return false; //The firmware doesn't support interlaced HEVC.
-               else if (format == PIPE_VIDEO_FORMAT_JPEG)
-                       return false;
-               else if (format == PIPE_VIDEO_FORMAT_VP9)
-                       return false;
-               return true;
-       }
-       case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
-               return true;
-       case PIPE_VIDEO_CAP_MAX_LEVEL:
-               switch (profile) {
-               case PIPE_VIDEO_PROFILE_MPEG1:
-                       return 0;
-               case PIPE_VIDEO_PROFILE_MPEG2_SIMPLE:
-               case PIPE_VIDEO_PROFILE_MPEG2_MAIN:
-                       return 3;
-               case PIPE_VIDEO_PROFILE_MPEG4_SIMPLE:
-                       return 3;
-               case PIPE_VIDEO_PROFILE_MPEG4_ADVANCED_SIMPLE:
-                       return 5;
-               case PIPE_VIDEO_PROFILE_VC1_SIMPLE:
-                       return 1;
-               case PIPE_VIDEO_PROFILE_VC1_MAIN:
-                       return 2;
-               case PIPE_VIDEO_PROFILE_VC1_ADVANCED:
-                       return 4;
-               case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE:
-               case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN:
-               case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH:
-                       return (sscreen->info.family < CHIP_TONGA) ? 41 : 52;
-               case PIPE_VIDEO_PROFILE_HEVC_MAIN:
-               case PIPE_VIDEO_PROFILE_HEVC_MAIN_10:
-                       return 186;
-               default:
-                       return 0;
-               }
-       default:
-               return 0;
-       }
+   struct si_screen *sscreen = (struct si_screen *)screen;
+   enum pipe_video_format codec = u_reduce_video_profile(profile);
+
+   if (entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) {
+      switch (param) {
+      case PIPE_VIDEO_CAP_SUPPORTED:
+         return (
+            (codec == PIPE_VIDEO_FORMAT_MPEG4_AVC &&
+             (sscreen->info.family >= CHIP_RAVEN || si_vce_is_fw_version_supported(sscreen))) ||
+            (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN &&
+             (sscreen->info.family >= CHIP_RAVEN || si_radeon_uvd_enc_supported(sscreen))) ||
+            (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10 && sscreen->info.family >= CHIP_RENOIR));
+      case PIPE_VIDEO_CAP_NPOT_TEXTURES:
+         return 1;
+      case PIPE_VIDEO_CAP_MAX_WIDTH:
+         return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096;
+      case PIPE_VIDEO_CAP_MAX_HEIGHT:
+         return (sscreen->info.family < CHIP_TONGA) ? 1152 : 2304;
+      case PIPE_VIDEO_CAP_PREFERED_FORMAT:
+         return PIPE_FORMAT_NV12;
+      case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
+         return false;
+      case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
+         return false;
+      case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
+         return true;
+      case PIPE_VIDEO_CAP_STACKED_FRAMES:
+         return (sscreen->info.family < CHIP_TONGA) ? 1 : 2;
+      default:
+         return 0;
+      }
+   }
+
+   switch (param) {
+   case PIPE_VIDEO_CAP_SUPPORTED:
+      switch (codec) {
+      case PIPE_VIDEO_FORMAT_MPEG12:
+         return profile != PIPE_VIDEO_PROFILE_MPEG1;
+      case PIPE_VIDEO_FORMAT_MPEG4:
+         return 1;
+      case PIPE_VIDEO_FORMAT_MPEG4_AVC:
+         if ((sscreen->info.family == CHIP_POLARIS10 || sscreen->info.family == CHIP_POLARIS11) &&
+             sscreen->info.uvd_fw_version < UVD_FW_1_66_16) {
+            RVID_ERR("POLARIS10/11 firmware version needs to be updated.\n");
+            return false;
+         }
+         return true;
+      case PIPE_VIDEO_FORMAT_VC1:
+         return true;
+      case PIPE_VIDEO_FORMAT_HEVC:
+         /* Carrizo only supports HEVC Main */
+         if (sscreen->info.family >= CHIP_STONEY)
+            return (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN ||
+                    profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10);
+         else if (sscreen->info.family >= CHIP_CARRIZO)
+            return profile == PIPE_VIDEO_PROFILE_HEVC_MAIN;
+         return false;
+      case PIPE_VIDEO_FORMAT_JPEG:
+         if (sscreen->info.family >= CHIP_RAVEN)
+            return true;
+         if (sscreen->info.family < CHIP_CARRIZO || sscreen->info.family >= CHIP_VEGA10)
+            return false;
+         if (!(sscreen->info.is_amdgpu && sscreen->info.drm_minor >= 19)) {
+            RVID_ERR("No MJPEG support for the kernel version\n");
+            return false;
+         }
+         return true;
+      case PIPE_VIDEO_FORMAT_VP9:
+         if (sscreen->info.family < CHIP_RAVEN)
+            return false;
+         return true;
+      default:
+         return false;
+      }
+   case PIPE_VIDEO_CAP_NPOT_TEXTURES:
+      return 1;
+   case PIPE_VIDEO_CAP_MAX_WIDTH:
+      switch (codec) {
+      case PIPE_VIDEO_FORMAT_HEVC:
+      case PIPE_VIDEO_FORMAT_VP9:
+         return (sscreen->info.family < CHIP_RENOIR)
+                   ? ((sscreen->info.family < CHIP_TONGA) ? 2048 : 4096)
+                   : 8192;
+      default:
+         return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096;
+      }
+   case PIPE_VIDEO_CAP_MAX_HEIGHT:
+      switch (codec) {
+      case PIPE_VIDEO_FORMAT_HEVC:
+      case PIPE_VIDEO_FORMAT_VP9:
+         return (sscreen->info.family < CHIP_RENOIR)
+                   ? ((sscreen->info.family < CHIP_TONGA) ? 1152 : 4096)
+                   : 4352;
+      default:
+         return (sscreen->info.family < CHIP_TONGA) ? 1152 : 4096;
+      }
+   case PIPE_VIDEO_CAP_PREFERED_FORMAT:
+      if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10)
+         return PIPE_FORMAT_P010;
+      else if (profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2)
+         return PIPE_FORMAT_P016;
+      else
+         return PIPE_FORMAT_NV12;
+
+   case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
+   case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: {
+      enum pipe_video_format format = u_reduce_video_profile(profile);
+
+      if (format == PIPE_VIDEO_FORMAT_HEVC)
+         return false; // The firmware doesn't support interlaced HEVC.
+      else if (format == PIPE_VIDEO_FORMAT_JPEG)
+         return false;
+      else if (format == PIPE_VIDEO_FORMAT_VP9)
+         return false;
+      return true;
+   }
+   case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
+      return true;
+   case PIPE_VIDEO_CAP_MAX_LEVEL:
+      switch (profile) {
+      case PIPE_VIDEO_PROFILE_MPEG1:
+         return 0;
+      case PIPE_VIDEO_PROFILE_MPEG2_SIMPLE:
+      case PIPE_VIDEO_PROFILE_MPEG2_MAIN:
+         return 3;
+      case PIPE_VIDEO_PROFILE_MPEG4_SIMPLE:
+         return 3;
+      case PIPE_VIDEO_PROFILE_MPEG4_ADVANCED_SIMPLE:
+         return 5;
+      case PIPE_VIDEO_PROFILE_VC1_SIMPLE:
+         return 1;
+      case PIPE_VIDEO_PROFILE_VC1_MAIN:
+         return 2;
+      case PIPE_VIDEO_PROFILE_VC1_ADVANCED:
+         return 4;
+      case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE:
+      case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN:
+      case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH:
+         return (sscreen->info.family < CHIP_TONGA) ? 41 : 52;
+      case PIPE_VIDEO_PROFILE_HEVC_MAIN:
+      case PIPE_VIDEO_PROFILE_HEVC_MAIN_10:
+         return 186;
+      default:
+         return 0;
+      }
+   default:
+      return 0;
+   }
 }
 
-static bool si_vid_is_format_supported(struct pipe_screen *screen,
-                                      enum pipe_format format,
-                                      enum pipe_video_profile profile,
-                                      enum pipe_video_entrypoint entrypoint)
+static bool si_vid_is_format_supported(struct pipe_screen *screen, enum pipe_format format,
+                                       enum pipe_video_profile profile,
+                                       enum pipe_video_entrypoint entrypoint)
 {
-       /* HEVC 10 bit decoding should use P010 instead of NV12 if possible */
-       if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10)
-               return (format == PIPE_FORMAT_NV12) ||
-                      (format == PIPE_FORMAT_P010) ||
-                      (format == PIPE_FORMAT_P016);
-
-       /* Vp9 profile 2 supports 10 bit decoding using P016 */
-       if (profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2)
-               return format == PIPE_FORMAT_P016;
+   /* HEVC 10 bit decoding should use P010 instead of NV12 if possible */
+   if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10)
+      return (format == PIPE_FORMAT_NV12) || (format == PIPE_FORMAT_P010) ||
+             (format == PIPE_FORMAT_P016);
 
+   /* Vp9 profile 2 supports 10 bit decoding using P016 */
+   if (profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2)
+      return format == PIPE_FORMAT_P016;
 
-       /* we can only handle this one with UVD */
-       if (profile != PIPE_VIDEO_PROFILE_UNKNOWN)
-               return format == PIPE_FORMAT_NV12;
+   /* we can only handle this one with UVD */
+   if (profile != PIPE_VIDEO_PROFILE_UNKNOWN)
+      return format == PIPE_FORMAT_NV12;
 
-       return vl_video_buffer_is_format_supported(screen, format, profile, entrypoint);
+   return vl_video_buffer_is_format_supported(screen, format, profile, entrypoint);
 }
 
-static unsigned get_max_threads_per_block(struct si_screen *screen,
-                                         enum pipe_shader_ir ir_type)
+static unsigned get_max_threads_per_block(struct si_screen *screen, enum pipe_shader_ir ir_type)
 {
-       if (ir_type == PIPE_SHADER_IR_NATIVE)
-               return 256;
+   if (ir_type == PIPE_SHADER_IR_NATIVE)
+      return 256;
 
-        /* LLVM 10 only supports 1024 threads per block. */
-       return 1024;
+   /* LLVM 10 only supports 1024 threads per block. */
+   return 1024;
 }
 
-static int si_get_compute_param(struct pipe_screen *screen,
-                               enum pipe_shader_ir ir_type,
-                               enum pipe_compute_cap param,
-                               void *ret)
+static int si_get_compute_param(struct pipe_screen *screen, enum pipe_shader_ir ir_type,
+                                enum pipe_compute_cap param, void *ret)
 {
-       struct si_screen *sscreen = (struct si_screen *)screen;
-
-       //TODO: select these params by asic
-       switch (param) {
-       case PIPE_COMPUTE_CAP_IR_TARGET: {
-               const char *gpu, *triple;
-
-               triple = "amdgcn-mesa-mesa3d";
-               gpu = ac_get_llvm_processor_name(sscreen->info.family);
-               if (ret) {
-                       sprintf(ret, "%s-%s", gpu, triple);
-               }
-               /* +2 for dash and terminating NIL byte */
-               return (strlen(triple) + strlen(gpu) + 2) * sizeof(char);
-       }
-       case PIPE_COMPUTE_CAP_GRID_DIMENSION:
-               if (ret) {
-                       uint64_t *grid_dimension = ret;
-                       grid_dimension[0] = 3;
-               }
-               return 1 * sizeof(uint64_t);
-
-       case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
-               if (ret) {
-                       uint64_t *grid_size = ret;
-                       grid_size[0] = 65535;
-                       grid_size[1] = 65535;
-                       grid_size[2] = 65535;
-               }
-               return 3 * sizeof(uint64_t) ;
-
-       case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
-               if (ret) {
-                       uint64_t *block_size = ret;
-                       unsigned threads_per_block = get_max_threads_per_block(sscreen, ir_type);
-                       block_size[0] = threads_per_block;
-                       block_size[1] = threads_per_block;
-                       block_size[2] = threads_per_block;
-               }
-               return 3 * sizeof(uint64_t);
-
-       case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
-               if (ret) {
-                       uint64_t *max_threads_per_block = ret;
-                       *max_threads_per_block = get_max_threads_per_block(sscreen, ir_type);
-               }
-               return sizeof(uint64_t);
-       case PIPE_COMPUTE_CAP_ADDRESS_BITS:
-               if (ret) {
-                       uint32_t *address_bits = ret;
-                       address_bits[0] = 64;
-               }
-               return 1 * sizeof(uint32_t);
-
-       case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE:
-               if (ret) {
-                       uint64_t *max_global_size = ret;
-                       uint64_t max_mem_alloc_size;
-
-                       si_get_compute_param(screen, ir_type,
-                               PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE,
-                               &max_mem_alloc_size);
-
-                       /* In OpenCL, the MAX_MEM_ALLOC_SIZE must be at least
-                        * 1/4 of the MAX_GLOBAL_SIZE.  Since the
-                        * MAX_MEM_ALLOC_SIZE is fixed for older kernels,
-                        * make sure we never report more than
-                        * 4 * MAX_MEM_ALLOC_SIZE.
-                        */
-                       *max_global_size = MIN2(4 * max_mem_alloc_size,
-                                               MAX2(sscreen->info.gart_size,
-                                                    sscreen->info.vram_size));
-               }
-               return sizeof(uint64_t);
-
-       case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE:
-               if (ret) {
-                       uint64_t *max_local_size = ret;
-                       /* Value reported by the closed source driver. */
-                       *max_local_size = 32768;
-               }
-               return sizeof(uint64_t);
-
-       case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE:
-               if (ret) {
-                       uint64_t *max_input_size = ret;
-                       /* Value reported by the closed source driver. */
-                       *max_input_size = 1024;
-               }
-               return sizeof(uint64_t);
-
-       case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE:
-               if (ret) {
-                       uint64_t *max_mem_alloc_size = ret;
-
-                       *max_mem_alloc_size = sscreen->info.max_alloc_size;
-               }
-               return sizeof(uint64_t);
-
-       case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
-               if (ret) {
-                       uint32_t *max_clock_frequency = ret;
-                       *max_clock_frequency = sscreen->info.max_shader_clock;
-               }
-               return sizeof(uint32_t);
-
-       case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
-               if (ret) {
-                       uint32_t *max_compute_units = ret;
-                       *max_compute_units = sscreen->info.num_good_compute_units;
-               }
-               return sizeof(uint32_t);
-
-       case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED:
-               if (ret) {
-                       uint32_t *images_supported = ret;
-                       *images_supported = 0;
-               }
-               return sizeof(uint32_t);
-       case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE:
-               break; /* unused */
-       case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
-               if (ret) {
-                       uint32_t *subgroup_size = ret;
-                       *subgroup_size = sscreen->compute_wave_size;
-               }
-               return sizeof(uint32_t);
-       case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
-               if (ret) {
-                       uint64_t *max_variable_threads_per_block = ret;
-                       if (ir_type == PIPE_SHADER_IR_NATIVE)
-                               *max_variable_threads_per_block = 0;
-                       else
-                               *max_variable_threads_per_block = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
-               }
-               return sizeof(uint64_t);
-       }
-
-        fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param);
-        return 0;
+   struct si_screen *sscreen = (struct si_screen *)screen;
+
+   // TODO: select these params by asic
+   switch (param) {
+   case PIPE_COMPUTE_CAP_IR_TARGET: {
+      const char *gpu, *triple;
+
+      triple = "amdgcn-mesa-mesa3d";
+      gpu = ac_get_llvm_processor_name(sscreen->info.family);
+      if (ret) {
+         sprintf(ret, "%s-%s", gpu, triple);
+      }
+      /* +2 for dash and terminating NIL byte */
+      return (strlen(triple) + strlen(gpu) + 2) * sizeof(char);
+   }
+   case PIPE_COMPUTE_CAP_GRID_DIMENSION:
+      if (ret) {
+         uint64_t *grid_dimension = ret;
+         grid_dimension[0] = 3;
+      }
+      return 1 * sizeof(uint64_t);
+
+   case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
+      if (ret) {
+         uint64_t *grid_size = ret;
+         grid_size[0] = 65535;
+         grid_size[1] = 65535;
+         grid_size[2] = 65535;
+      }
+      return 3 * sizeof(uint64_t);
+
+   case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
+      if (ret) {
+         uint64_t *block_size = ret;
+         unsigned threads_per_block = get_max_threads_per_block(sscreen, ir_type);
+         block_size[0] = threads_per_block;
+         block_size[1] = threads_per_block;
+         block_size[2] = threads_per_block;
+      }
+      return 3 * sizeof(uint64_t);
+
+   case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
+      if (ret) {
+         uint64_t *max_threads_per_block = ret;
+         *max_threads_per_block = get_max_threads_per_block(sscreen, ir_type);
+      }
+      return sizeof(uint64_t);
+   case PIPE_COMPUTE_CAP_ADDRESS_BITS:
+      if (ret) {
+         uint32_t *address_bits = ret;
+         address_bits[0] = 64;
+      }
+      return 1 * sizeof(uint32_t);
+
+   case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE:
+      if (ret) {
+         uint64_t *max_global_size = ret;
+         uint64_t max_mem_alloc_size;
+
+         si_get_compute_param(screen, ir_type, PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE,
+                              &max_mem_alloc_size);
+
+         /* In OpenCL, the MAX_MEM_ALLOC_SIZE must be at least
+          * 1/4 of the MAX_GLOBAL_SIZE.  Since the
+          * MAX_MEM_ALLOC_SIZE is fixed for older kernels,
+          * make sure we never report more than
+          * 4 * MAX_MEM_ALLOC_SIZE.
+          */
+         *max_global_size =
+            MIN2(4 * max_mem_alloc_size, MAX2(sscreen->info.gart_size, sscreen->info.vram_size));
+      }
+      return sizeof(uint64_t);
+
+   case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE:
+      if (ret) {
+         uint64_t *max_local_size = ret;
+         /* Value reported by the closed source driver. */
+         *max_local_size = 32768;
+      }
+      return sizeof(uint64_t);
+
+   case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE:
+      if (ret) {
+         uint64_t *max_input_size = ret;
+         /* Value reported by the closed source driver. */
+         *max_input_size = 1024;
+      }
+      return sizeof(uint64_t);
+
+   case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE:
+      if (ret) {
+         uint64_t *max_mem_alloc_size = ret;
+
+         *max_mem_alloc_size = sscreen->info.max_alloc_size;
+      }
+      return sizeof(uint64_t);
+
+   case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
+      if (ret) {
+         uint32_t *max_clock_frequency = ret;
+         *max_clock_frequency = sscreen->info.max_shader_clock;
+      }
+      return sizeof(uint32_t);
+
+   case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
+      if (ret) {
+         uint32_t *max_compute_units = ret;
+         *max_compute_units = sscreen->info.num_good_compute_units;
+      }
+      return sizeof(uint32_t);
+
+   case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED:
+      if (ret) {
+         uint32_t *images_supported = ret;
+         *images_supported = 0;
+      }
+      return sizeof(uint32_t);
+   case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE:
+      break; /* unused */
+   case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
+      if (ret) {
+         uint32_t *subgroup_size = ret;
+         *subgroup_size = sscreen->compute_wave_size;
+      }
+      return sizeof(uint32_t);
+   case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
+      if (ret) {
+         uint64_t *max_variable_threads_per_block = ret;
+         if (ir_type == PIPE_SHADER_IR_NATIVE)
+            *max_variable_threads_per_block = 0;
+         else
+            *max_variable_threads_per_block = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
+      }
+      return sizeof(uint64_t);
+   }
+
+   fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param);
+   return 0;
 }
 
 static uint64_t si_get_timestamp(struct pipe_screen *screen)
 {
-       struct si_screen *sscreen = (struct si_screen*)screen;
+   struct si_screen *sscreen = (struct si_screen *)screen;
 
-       return 1000000 * sscreen->ws->query_value(sscreen->ws, RADEON_TIMESTAMP) /
-                       sscreen->info.clock_crystal_freq;
+   return 1000000 * sscreen->ws->query_value(sscreen->ws, RADEON_TIMESTAMP) /
+          sscreen->info.clock_crystal_freq;
 }
 
-static void si_query_memory_info(struct pipe_screen *screen,
-                                struct pipe_memory_info *info)
+static void si_query_memory_info(struct pipe_screen *screen, struct pipe_memory_info *info)
 {
-       struct si_screen *sscreen = (struct si_screen*)screen;
-       struct radeon_winsys *ws = sscreen->ws;
-       unsigned vram_usage, gtt_usage;
-
-       info->total_device_memory = sscreen->info.vram_size / 1024;
-       info->total_staging_memory = sscreen->info.gart_size / 1024;
-
-       /* The real TTM memory usage is somewhat random, because:
-        *
-        * 1) TTM delays freeing memory, because it can only free it after
-        *    fences expire.
-        *
-        * 2) The memory usage can be really low if big VRAM evictions are
-        *    taking place, but the real usage is well above the size of VRAM.
-        *
-        * Instead, return statistics of this process.
-        */
-       vram_usage = ws->query_value(ws, RADEON_VRAM_USAGE) / 1024;
-       gtt_usage =  ws->query_value(ws, RADEON_GTT_USAGE) / 1024;
-
-       info->avail_device_memory =
-               vram_usage <= info->total_device_memory ?
-                               info->total_device_memory - vram_usage : 0;
-       info->avail_staging_memory =
-               gtt_usage <= info->total_staging_memory ?
-                               info->total_staging_memory - gtt_usage : 0;
-
-       info->device_memory_evicted =
-               ws->query_value(ws, RADEON_NUM_BYTES_MOVED) / 1024;
-
-       if (sscreen->info.is_amdgpu && sscreen->info.drm_minor >= 4)
-               info->nr_device_memory_evictions =
-                       ws->query_value(ws, RADEON_NUM_EVICTIONS);
-       else
-               /* Just return the number of evicted 64KB pages. */
-               info->nr_device_memory_evictions = info->device_memory_evicted / 64;
+   struct si_screen *sscreen = (struct si_screen *)screen;
+   struct radeon_winsys *ws = sscreen->ws;
+   unsigned vram_usage, gtt_usage;
+
+   info->total_device_memory = sscreen->info.vram_size / 1024;
+   info->total_staging_memory = sscreen->info.gart_size / 1024;
+
+   /* The real TTM memory usage is somewhat random, because:
+    *
+    * 1) TTM delays freeing memory, because it can only free it after
+    *    fences expire.
+    *
+    * 2) The memory usage can be really low if big VRAM evictions are
+    *    taking place, but the real usage is well above the size of VRAM.
+    *
+    * Instead, return statistics of this process.
+    */
+   vram_usage = ws->query_value(ws, RADEON_VRAM_USAGE) / 1024;
+   gtt_usage = ws->query_value(ws, RADEON_GTT_USAGE) / 1024;
+
+   info->avail_device_memory =
+      vram_usage <= info->total_device_memory ? info->total_device_memory - vram_usage : 0;
+   info->avail_staging_memory =
+      gtt_usage <= info->total_staging_memory ? info->total_staging_memory - gtt_usage : 0;
+
+   info->device_memory_evicted = ws->query_value(ws, RADEON_NUM_BYTES_MOVED) / 1024;
+
+   if (sscreen->info.is_amdgpu && sscreen->info.drm_minor >= 4)
+      info->nr_device_memory_evictions = ws->query_value(ws, RADEON_NUM_EVICTIONS);
+   else
+      /* Just return the number of evicted 64KB pages. */
+      info->nr_device_memory_evictions = info->device_memory_evicted / 64;
 }
 
 static struct disk_cache *si_get_disk_shader_cache(struct pipe_screen *pscreen)
 {
-       struct si_screen *sscreen = (struct si_screen*)pscreen;
+   struct si_screen *sscreen = (struct si_screen *)pscreen;
 
-       return sscreen->disk_shader_cache;
+   return sscreen->disk_shader_cache;
 }
 
 static void si_init_renderer_string(struct si_screen *sscreen)
 {
-       char first_name[256], second_name[32] = {}, kernel_version[128] = {};
-       struct utsname uname_data;
-
-       if (sscreen->info.marketing_name) {
-               snprintf(first_name, sizeof(first_name), "%s",
-                        sscreen->info.marketing_name);
-               snprintf(second_name, sizeof(second_name), "%s, ",
-                        sscreen->info.name);
-       } else {
-               snprintf(first_name, sizeof(first_name), "AMD %s",
-                        sscreen->info.name);
-       }
-
-       if (uname(&uname_data) == 0)
-               snprintf(kernel_version, sizeof(kernel_version),
-                        ", %s", uname_data.release);
-
-       snprintf(sscreen->renderer_string, sizeof(sscreen->renderer_string),
-                "%s (%sDRM %i.%i.%i%s, LLVM " MESA_LLVM_VERSION_STRING ")",
-                first_name, second_name, sscreen->info.drm_major,
-                sscreen->info.drm_minor, sscreen->info.drm_patchlevel,
-                kernel_version);
+   char first_name[256], second_name[32] = {}, kernel_version[128] = {};
+   struct utsname uname_data;
+
+   if (sscreen->info.marketing_name) {
+      snprintf(first_name, sizeof(first_name), "%s", sscreen->info.marketing_name);
+      snprintf(second_name, sizeof(second_name), "%s, ", sscreen->info.name);
+   } else {
+      snprintf(first_name, sizeof(first_name), "AMD %s", sscreen->info.name);
+   }
+
+   if (uname(&uname_data) == 0)
+      snprintf(kernel_version, sizeof(kernel_version), ", %s", uname_data.release);
+
+   snprintf(sscreen->renderer_string, sizeof(sscreen->renderer_string),
+            "%s (%sDRM %i.%i.%i%s, LLVM " MESA_LLVM_VERSION_STRING ")", first_name, second_name,
+            sscreen->info.drm_major, sscreen->info.drm_minor, sscreen->info.drm_patchlevel,
+            kernel_version);
 }
 
 void si_init_screen_get_functions(struct si_screen *sscreen)
 {
-       sscreen->b.get_name = si_get_name;
-       sscreen->b.get_vendor = si_get_vendor;
-       sscreen->b.get_device_vendor = si_get_device_vendor;
-       sscreen->b.get_param = si_get_param;
-       sscreen->b.get_paramf = si_get_paramf;
-       sscreen->b.get_compute_param = si_get_compute_param;
-       sscreen->b.get_timestamp = si_get_timestamp;
-       sscreen->b.get_shader_param = si_get_shader_param;
-       sscreen->b.get_compiler_options = si_get_compiler_options;
-       sscreen->b.get_device_uuid = si_get_device_uuid;
-       sscreen->b.get_driver_uuid = si_get_driver_uuid;
-       sscreen->b.query_memory_info = si_query_memory_info;
-       sscreen->b.get_disk_shader_cache = si_get_disk_shader_cache;
-
-       if (sscreen->info.has_hw_decode) {
-               sscreen->b.get_video_param = si_get_video_param;
-               sscreen->b.is_video_format_supported = si_vid_is_format_supported;
-       } else {
-               sscreen->b.get_video_param = si_get_video_param_no_decode;
-               sscreen->b.is_video_format_supported = vl_video_buffer_is_format_supported;
-       }
-
-       si_init_renderer_string(sscreen);
+   sscreen->b.get_name = si_get_name;
+   sscreen->b.get_vendor = si_get_vendor;
+   sscreen->b.get_device_vendor = si_get_device_vendor;
+   sscreen->b.get_param = si_get_param;
+   sscreen->b.get_paramf = si_get_paramf;
+   sscreen->b.get_compute_param = si_get_compute_param;
+   sscreen->b.get_timestamp = si_get_timestamp;
+   sscreen->b.get_shader_param = si_get_shader_param;
+   sscreen->b.get_compiler_options = si_get_compiler_options;
+   sscreen->b.get_device_uuid = si_get_device_uuid;
+   sscreen->b.get_driver_uuid = si_get_driver_uuid;
+   sscreen->b.query_memory_info = si_query_memory_info;
+   sscreen->b.get_disk_shader_cache = si_get_disk_shader_cache;
+
+   if (sscreen->info.has_hw_decode) {
+      sscreen->b.get_video_param = si_get_video_param;
+      sscreen->b.is_video_format_supported = si_vid_is_format_supported;
+   } else {
+      sscreen->b.get_video_param = si_get_video_param_no_decode;
+      sscreen->b.is_video_format_supported = vl_video_buffer_is_format_supported;
+   }
+
+   si_init_renderer_string(sscreen);
 }
index 9311b6e63864745a1656bed1d810c011414ffa97..30ba6b02f873ac7cc3f472d89e82387ebcd5fde9 100644 (file)
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include "si_pipe.h"
 #include "si_build_pm4.h"
+#include "si_pipe.h"
 #include "sid.h"
-
 #include "util/os_time.h"
 #include "util/u_upload_mgr.h"
 
 /* initialize */
 void si_need_gfx_cs_space(struct si_context *ctx)
 {
-       struct radeon_cmdbuf *cs = ctx->gfx_cs;
-
-       /* There is no need to flush the DMA IB here, because
-        * si_need_dma_space always flushes the GFX IB if there is
-        * a conflict, which means any unflushed DMA commands automatically
-        * precede the GFX IB (= they had no dependency on the GFX IB when
-        * they were submitted).
-        */
-
-       /* There are two memory usage counters in the winsys for all buffers
-        * that have been added (cs_add_buffer) and two counters in the pipe
-        * driver for those that haven't been added yet.
-        */
-       if (unlikely(!radeon_cs_memory_below_limit(ctx->screen, ctx->gfx_cs,
-                                                  ctx->vram, ctx->gtt))) {
-               ctx->gtt = 0;
-               ctx->vram = 0;
-               si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
-               return;
-       }
-       ctx->gtt = 0;
-       ctx->vram = 0;
-
-       unsigned need_dwords = si_get_minimum_num_gfx_cs_dwords(ctx);
-       if (!ctx->ws->cs_check_space(cs, need_dwords, false))
-               si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+   struct radeon_cmdbuf *cs = ctx->gfx_cs;
+
+   /* There is no need to flush the DMA IB here, because
+    * si_need_dma_space always flushes the GFX IB if there is
+    * a conflict, which means any unflushed DMA commands automatically
+    * precede the GFX IB (= they had no dependency on the GFX IB when
+    * they were submitted).
+    */
+
+   /* There are two memory usage counters in the winsys for all buffers
+    * that have been added (cs_add_buffer) and two counters in the pipe
+    * driver for those that haven't been added yet.
+    */
+   if (unlikely(!radeon_cs_memory_below_limit(ctx->screen, ctx->gfx_cs, ctx->vram, ctx->gtt))) {
+      ctx->gtt = 0;
+      ctx->vram = 0;
+      si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+      return;
+   }
+   ctx->gtt = 0;
+   ctx->vram = 0;
+
+   unsigned need_dwords = si_get_minimum_num_gfx_cs_dwords(ctx);
+   if (!ctx->ws->cs_check_space(cs, need_dwords, false))
+      si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 }
 
 void si_unref_sdma_uploads(struct si_context *sctx)
 {
-       for (unsigned i = 0; i < sctx->num_sdma_uploads; i++) {
-               si_resource_reference(&sctx->sdma_uploads[i].dst, NULL);
-               si_resource_reference(&sctx->sdma_uploads[i].src, NULL);
-       }
-       sctx->num_sdma_uploads = 0;
+   for (unsigned i = 0; i < sctx->num_sdma_uploads; i++) {
+      si_resource_reference(&sctx->sdma_uploads[i].dst, NULL);
+      si_resource_reference(&sctx->sdma_uploads[i].src, NULL);
+   }
+   sctx->num_sdma_uploads = 0;
 }
 
-void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
-                    struct pipe_fence_handle **fence)
+void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence)
 {
-       struct radeon_cmdbuf *cs = ctx->gfx_cs;
-       struct radeon_winsys *ws = ctx->ws;
-       const unsigned wait_ps_cs = SI_CONTEXT_PS_PARTIAL_FLUSH |
-                                   SI_CONTEXT_CS_PARTIAL_FLUSH;
-       unsigned wait_flags = 0;
-
-       if (ctx->gfx_flush_in_progress)
-               return;
-
-       if (!ctx->screen->info.kernel_flushes_tc_l2_after_ib) {
-               wait_flags |= wait_ps_cs |
-                             SI_CONTEXT_INV_L2;
-       } else if (ctx->chip_class == GFX6) {
-               /* The kernel flushes L2 before shaders are finished. */
-               wait_flags |= wait_ps_cs;
-       } else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
-               wait_flags |= wait_ps_cs;
-       }
-
-       /* Drop this flush if it's a no-op. */
-       if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) &&
-           (!wait_flags || !ctx->gfx_last_ib_is_busy))
-               return;
-
-       if (ctx->b.get_device_reset_status(&ctx->b) != PIPE_NO_RESET)
-               return;
-
-       if (ctx->screen->debug_flags & DBG(CHECK_VM))
-               flags &= ~PIPE_FLUSH_ASYNC;
-
-       ctx->gfx_flush_in_progress = true;
-
-       /* If the state tracker is flushing the GFX IB, si_flush_from_st is
-        * responsible for flushing the DMA IB and merging the fences from both.
-        * If the driver flushes the GFX IB internally, it should never ask
-        * for a fence handle.
-        */
-       assert(!radeon_emitted(ctx->sdma_cs, 0) || fence == NULL);
-
-       /* Update the sdma_uploads list by flushing the uploader. */
-       u_upload_unmap(ctx->b.const_uploader);
-
-       /* Execute SDMA uploads. */
-       ctx->sdma_uploads_in_progress = true;
-       for (unsigned i = 0; i < ctx->num_sdma_uploads; i++) {
-               struct si_sdma_upload *up = &ctx->sdma_uploads[i];
-
-               assert(up->src_offset % 4 == 0 && up->dst_offset % 4 == 0 &&
-                      up->size % 4 == 0);
-
-               si_sdma_copy_buffer(ctx, &up->dst->b.b, &up->src->b.b,
-                                   up->dst_offset, up->src_offset, up->size);
-       }
-       ctx->sdma_uploads_in_progress = false;
-       si_unref_sdma_uploads(ctx);
-
-       /* Flush SDMA (preamble IB). */
-       if (radeon_emitted(ctx->sdma_cs, 0))
-               si_flush_dma_cs(ctx, flags, NULL);
-
-       if (radeon_emitted(ctx->prim_discard_compute_cs, 0)) {
-               struct radeon_cmdbuf *compute_cs = ctx->prim_discard_compute_cs;
-               si_compute_signal_gfx(ctx);
-
-               /* Make sure compute shaders are idle before leaving the IB, so that
-                * the next IB doesn't overwrite GDS that might be in use. */
-               radeon_emit(compute_cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-               radeon_emit(compute_cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) |
-                                       EVENT_INDEX(4));
-
-               /* Save the GDS prim restart counter if needed. */
-               if (ctx->preserve_prim_restart_gds_at_flush) {
-                       si_cp_copy_data(ctx, compute_cs,
-                                       COPY_DATA_DST_MEM, ctx->wait_mem_scratch, 4,
-                                       COPY_DATA_GDS, NULL, 4);
-               }
-       }
-
-       if (ctx->has_graphics) {
-               if (!list_is_empty(&ctx->active_queries))
-                       si_suspend_queries(ctx);
-
-               ctx->streamout.suspended = false;
-               if (ctx->streamout.begin_emitted) {
-                       si_emit_streamout_end(ctx);
-                       ctx->streamout.suspended = true;
-
-                       /* Since NGG streamout uses GDS, we need to make GDS
-                        * idle when we leave the IB, otherwise another process
-                        * might overwrite it while our shaders are busy.
-                        */
-                       if (ctx->screen->use_ngg_streamout)
-                               wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
-               }
-       }
-
-       /* Make sure CP DMA is idle at the end of IBs after L2 prefetches
-        * because the kernel doesn't wait for it. */
-       if (ctx->chip_class >= GFX7)
-               si_cp_dma_wait_for_idle(ctx);
-
-       /* Wait for draw calls to finish if needed. */
-       if (wait_flags) {
-               ctx->flags |= wait_flags;
-               ctx->emit_cache_flush(ctx);
-       }
-       ctx->gfx_last_ib_is_busy = (wait_flags & wait_ps_cs) != wait_ps_cs;
-
-       if (ctx->current_saved_cs) {
-               si_trace_emit(ctx);
-
-               /* Save the IB for debug contexts. */
-               si_save_cs(ws, cs, &ctx->current_saved_cs->gfx, true);
-               ctx->current_saved_cs->flushed = true;
-               ctx->current_saved_cs->time_flush = os_time_get_nano();
-
-               si_log_hw_flush(ctx);
-       }
-
-       if (si_compute_prim_discard_enabled(ctx)) {
-               /* The compute IB can start after the previous gfx IB starts. */
-               if (radeon_emitted(ctx->prim_discard_compute_cs, 0) &&
-                   ctx->last_gfx_fence) {
-                       ctx->ws->cs_add_fence_dependency(ctx->gfx_cs,
-                                                        ctx->last_gfx_fence,
-                                                        RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY |
-                                                        RADEON_DEPENDENCY_START_FENCE);
-               }
-
-               /* Remember the last execution barrier. It's in the IB.
-                * It will signal the start of the next compute IB.
-                */
-               if (flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW &&
-                   ctx->last_pkt3_write_data) {
-                       *ctx->last_pkt3_write_data = PKT3(PKT3_WRITE_DATA, 3, 0);
-                       ctx->last_pkt3_write_data = NULL;
-
-                       si_resource_reference(&ctx->last_ib_barrier_buf, ctx->barrier_buf);
-                       ctx->last_ib_barrier_buf_offset = ctx->barrier_buf_offset;
-                       si_resource_reference(&ctx->barrier_buf, NULL);
-
-                       ws->fence_reference(&ctx->last_ib_barrier_fence, NULL);
-               }
-       }
-
-       /* Flush the CS. */
-       ws->cs_flush(cs, flags, &ctx->last_gfx_fence);
-       if (fence)
-               ws->fence_reference(fence, ctx->last_gfx_fence);
-
-       ctx->num_gfx_cs_flushes++;
-
-       if (si_compute_prim_discard_enabled(ctx)) {
-               /* Remember the last execution barrier, which is the last fence
-                * in this case.
-                */
-               if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
-                       ctx->last_pkt3_write_data = NULL;
-                       si_resource_reference(&ctx->last_ib_barrier_buf, NULL);
-                       ws->fence_reference(&ctx->last_ib_barrier_fence, ctx->last_gfx_fence);
-               }
-       }
-
-       /* Check VM faults if needed. */
-       if (ctx->screen->debug_flags & DBG(CHECK_VM)) {
-               /* Use conservative timeout 800ms, after which we won't wait any
-                * longer and assume the GPU is hung.
-                */
-               ctx->ws->fence_wait(ctx->ws, ctx->last_gfx_fence, 800*1000*1000);
-
-               si_check_vm_faults(ctx, &ctx->current_saved_cs->gfx, RING_GFX);
-       }
-
-       if (ctx->current_saved_cs)
-               si_saved_cs_reference(&ctx->current_saved_cs, NULL);
-
-       si_begin_new_gfx_cs(ctx);
-       ctx->gfx_flush_in_progress = false;
+   struct radeon_cmdbuf *cs = ctx->gfx_cs;
+   struct radeon_winsys *ws = ctx->ws;
+   const unsigned wait_ps_cs = SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
+   unsigned wait_flags = 0;
+
+   if (ctx->gfx_flush_in_progress)
+      return;
+
+   if (!ctx->screen->info.kernel_flushes_tc_l2_after_ib) {
+      wait_flags |= wait_ps_cs | SI_CONTEXT_INV_L2;
+   } else if (ctx->chip_class == GFX6) {
+      /* The kernel flushes L2 before shaders are finished. */
+      wait_flags |= wait_ps_cs;
+   } else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
+      wait_flags |= wait_ps_cs;
+   }
+
+   /* Drop this flush if it's a no-op. */
+   if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) && (!wait_flags || !ctx->gfx_last_ib_is_busy))
+      return;
+
+   if (ctx->b.get_device_reset_status(&ctx->b) != PIPE_NO_RESET)
+      return;
+
+   if (ctx->screen->debug_flags & DBG(CHECK_VM))
+      flags &= ~PIPE_FLUSH_ASYNC;
+
+   ctx->gfx_flush_in_progress = true;
+
+   /* If the state tracker is flushing the GFX IB, si_flush_from_st is
+    * responsible for flushing the DMA IB and merging the fences from both.
+    * If the driver flushes the GFX IB internally, it should never ask
+    * for a fence handle.
+    */
+   assert(!radeon_emitted(ctx->sdma_cs, 0) || fence == NULL);
+
+   /* Update the sdma_uploads list by flushing the uploader. */
+   u_upload_unmap(ctx->b.const_uploader);
+
+   /* Execute SDMA uploads. */
+   ctx->sdma_uploads_in_progress = true;
+   for (unsigned i = 0; i < ctx->num_sdma_uploads; i++) {
+      struct si_sdma_upload *up = &ctx->sdma_uploads[i];
+
+      assert(up->src_offset % 4 == 0 && up->dst_offset % 4 == 0 && up->size % 4 == 0);
+
+      si_sdma_copy_buffer(ctx, &up->dst->b.b, &up->src->b.b, up->dst_offset, up->src_offset,
+                          up->size);
+   }
+   ctx->sdma_uploads_in_progress = false;
+   si_unref_sdma_uploads(ctx);
+
+   /* Flush SDMA (preamble IB). */
+   if (radeon_emitted(ctx->sdma_cs, 0))
+      si_flush_dma_cs(ctx, flags, NULL);
+
+   if (radeon_emitted(ctx->prim_discard_compute_cs, 0)) {
+      struct radeon_cmdbuf *compute_cs = ctx->prim_discard_compute_cs;
+      si_compute_signal_gfx(ctx);
+
+      /* Make sure compute shaders are idle before leaving the IB, so that
+       * the next IB doesn't overwrite GDS that might be in use. */
+      radeon_emit(compute_cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+      radeon_emit(compute_cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+      /* Save the GDS prim restart counter if needed. */
+      if (ctx->preserve_prim_restart_gds_at_flush) {
+         si_cp_copy_data(ctx, compute_cs, COPY_DATA_DST_MEM, ctx->wait_mem_scratch, 4,
+                         COPY_DATA_GDS, NULL, 4);
+      }
+   }
+
+   if (ctx->has_graphics) {
+      if (!list_is_empty(&ctx->active_queries))
+         si_suspend_queries(ctx);
+
+      ctx->streamout.suspended = false;
+      if (ctx->streamout.begin_emitted) {
+         si_emit_streamout_end(ctx);
+         ctx->streamout.suspended = true;
+
+         /* Since NGG streamout uses GDS, we need to make GDS
+          * idle when we leave the IB, otherwise another process
+          * might overwrite it while our shaders are busy.
+          */
+         if (ctx->screen->use_ngg_streamout)
+            wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+      }
+   }
+
+   /* Make sure CP DMA is idle at the end of IBs after L2 prefetches
+    * because the kernel doesn't wait for it. */
+   if (ctx->chip_class >= GFX7)
+      si_cp_dma_wait_for_idle(ctx);
+
+   /* Wait for draw calls to finish if needed. */
+   if (wait_flags) {
+      ctx->flags |= wait_flags;
+      ctx->emit_cache_flush(ctx);
+   }
+   ctx->gfx_last_ib_is_busy = (wait_flags & wait_ps_cs) != wait_ps_cs;
+
+   if (ctx->current_saved_cs) {
+      si_trace_emit(ctx);
+
+      /* Save the IB for debug contexts. */
+      si_save_cs(ws, cs, &ctx->current_saved_cs->gfx, true);
+      ctx->current_saved_cs->flushed = true;
+      ctx->current_saved_cs->time_flush = os_time_get_nano();
+
+      si_log_hw_flush(ctx);
+   }
+
+   if (si_compute_prim_discard_enabled(ctx)) {
+      /* The compute IB can start after the previous gfx IB starts. */
+      if (radeon_emitted(ctx->prim_discard_compute_cs, 0) && ctx->last_gfx_fence) {
+         ctx->ws->cs_add_fence_dependency(
+            ctx->gfx_cs, ctx->last_gfx_fence,
+            RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY | RADEON_DEPENDENCY_START_FENCE);
+      }
+
+      /* Remember the last execution barrier. It's in the IB.
+       * It will signal the start of the next compute IB.
+       */
+      if (flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW && ctx->last_pkt3_write_data) {
+         *ctx->last_pkt3_write_data = PKT3(PKT3_WRITE_DATA, 3, 0);
+         ctx->last_pkt3_write_data = NULL;
+
+         si_resource_reference(&ctx->last_ib_barrier_buf, ctx->barrier_buf);
+         ctx->last_ib_barrier_buf_offset = ctx->barrier_buf_offset;
+         si_resource_reference(&ctx->barrier_buf, NULL);
+
+         ws->fence_reference(&ctx->last_ib_barrier_fence, NULL);
+      }
+   }
+
+   /* Flush the CS. */
+   ws->cs_flush(cs, flags, &ctx->last_gfx_fence);
+   if (fence)
+      ws->fence_reference(fence, ctx->last_gfx_fence);
+
+   ctx->num_gfx_cs_flushes++;
+
+   if (si_compute_prim_discard_enabled(ctx)) {
+      /* Remember the last execution barrier, which is the last fence
+       * in this case.
+       */
+      if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
+         ctx->last_pkt3_write_data = NULL;
+         si_resource_reference(&ctx->last_ib_barrier_buf, NULL);
+         ws->fence_reference(&ctx->last_ib_barrier_fence, ctx->last_gfx_fence);
+      }
+   }
+
+   /* Check VM faults if needed. */
+   if (ctx->screen->debug_flags & DBG(CHECK_VM)) {
+      /* Use conservative timeout 800ms, after which we won't wait any
+       * longer and assume the GPU is hung.
+       */
+      ctx->ws->fence_wait(ctx->ws, ctx->last_gfx_fence, 800 * 1000 * 1000);
+
+      si_check_vm_faults(ctx, &ctx->current_saved_cs->gfx, RING_GFX);
+   }
+
+   if (ctx->current_saved_cs)
+      si_saved_cs_reference(&ctx->current_saved_cs, NULL);
+
+   si_begin_new_gfx_cs(ctx);
+   ctx->gfx_flush_in_progress = false;
 }
 
 static void si_begin_gfx_cs_debug(struct si_context *ctx)
 {
-       static const uint32_t zeros[1];
-       assert(!ctx->current_saved_cs);
+   static const uint32_t zeros[1];
+   assert(!ctx->current_saved_cs);
 
-       ctx->current_saved_cs = calloc(1, sizeof(*ctx->current_saved_cs));
-       if (!ctx->current_saved_cs)
-               return;
+   ctx->current_saved_cs = calloc(1, sizeof(*ctx->current_saved_cs));
+   if (!ctx->current_saved_cs)
+      return;
 
-       pipe_reference_init(&ctx->current_saved_cs->reference, 1);
+   pipe_reference_init(&ctx->current_saved_cs->reference, 1);
 
-       ctx->current_saved_cs->trace_buf = si_resource(
-               pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 8));
-       if (!ctx->current_saved_cs->trace_buf) {
-               free(ctx->current_saved_cs);
-               ctx->current_saved_cs = NULL;
-               return;
-       }
+   ctx->current_saved_cs->trace_buf =
+      si_resource(pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 8));
+   if (!ctx->current_saved_cs->trace_buf) {
+      free(ctx->current_saved_cs);
+      ctx->current_saved_cs = NULL;
+      return;
+   }
 
-       pipe_buffer_write_nooverlap(&ctx->b, &ctx->current_saved_cs->trace_buf->b.b,
-                                   0, sizeof(zeros), zeros);
-       ctx->current_saved_cs->trace_id = 0;
+   pipe_buffer_write_nooverlap(&ctx->b, &ctx->current_saved_cs->trace_buf->b.b, 0, sizeof(zeros),
+                               zeros);
+   ctx->current_saved_cs->trace_id = 0;
 
-       si_trace_emit(ctx);
+   si_trace_emit(ctx);
 
-       radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->current_saved_cs->trace_buf,
-                             RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
+   radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->current_saved_cs->trace_buf,
+                             RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
 }
 
 static void si_add_gds_to_buffer_list(struct si_context *sctx)
 {
-       if (sctx->gds) {
-               sctx->ws->cs_add_buffer(sctx->gfx_cs, sctx->gds,
-                                      RADEON_USAGE_READWRITE, 0, 0);
-               if (sctx->gds_oa) {
-                       sctx->ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa,
-                                              RADEON_USAGE_READWRITE, 0, 0);
-               }
-       }
+   if (sctx->gds) {
+      sctx->ws->cs_add_buffer(sctx->gfx_cs, sctx->gds, RADEON_USAGE_READWRITE, 0, 0);
+      if (sctx->gds_oa) {
+         sctx->ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa, RADEON_USAGE_READWRITE, 0, 0);
+      }
+   }
 }
 
 void si_allocate_gds(struct si_context *sctx)
 {
-       struct radeon_winsys *ws = sctx->ws;
+   struct radeon_winsys *ws = sctx->ws;
 
-       if (sctx->gds)
-               return;
+   if (sctx->gds)
+      return;
 
-       assert(sctx->screen->use_ngg_streamout);
+   assert(sctx->screen->use_ngg_streamout);
 
-       /* 4 streamout GDS counters.
-        * We need 256B (64 dw) of GDS, otherwise streamout hangs.
-        */
-       sctx->gds = ws->buffer_create(ws, 256, 4, RADEON_DOMAIN_GDS, 0);
-       sctx->gds_oa = ws->buffer_create(ws, 4, 1, RADEON_DOMAIN_OA, 0);
+   /* 4 streamout GDS counters.
+    * We need 256B (64 dw) of GDS, otherwise streamout hangs.
+    */
+   sctx->gds = ws->buffer_create(ws, 256, 4, RADEON_DOMAIN_GDS, 0);
+   sctx->gds_oa = ws->buffer_create(ws, 4, 1, RADEON_DOMAIN_OA, 0);
 
-       assert(sctx->gds && sctx->gds_oa);
-       si_add_gds_to_buffer_list(sctx);
+   assert(sctx->gds && sctx->gds_oa);
+   si_add_gds_to_buffer_list(sctx);
 }
 
 void si_begin_new_gfx_cs(struct si_context *ctx)
 {
-       if (ctx->is_debug)
-               si_begin_gfx_cs_debug(ctx);
-
-       si_add_gds_to_buffer_list(ctx);
-
-       /* Always invalidate caches at the beginning of IBs, because external
-        * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
-        * buffers.
-        *
-        * Note that the cache flush done by the kernel at the end of GFX IBs
-        * isn't useful here, because that flush can finish after the following
-        * IB starts drawing.
-        *
-        * TODO: Do we also need to invalidate CB & DB caches?
-        */
-       ctx->flags |= SI_CONTEXT_INV_ICACHE |
-                     SI_CONTEXT_INV_SCACHE |
-                     SI_CONTEXT_INV_VCACHE |
-                     SI_CONTEXT_INV_L2 |
-                     SI_CONTEXT_START_PIPELINE_STATS;
-
-       ctx->cs_shader_state.initialized = false;
-       si_all_descriptors_begin_new_cs(ctx);
-
-       if (!ctx->has_graphics) {
-               ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
-               return;
-       }
-
-       /* set all valid group as dirty so they get reemited on
-        * next draw command
-        */
-       si_pm4_reset_emitted(ctx);
-
-       /* The CS initialization should be emitted before everything else. */
-       si_pm4_emit(ctx, ctx->init_config);
-       if (ctx->init_config_gs_rings)
-               si_pm4_emit(ctx, ctx->init_config_gs_rings);
-
-       if (ctx->queued.named.ls)
-               ctx->prefetch_L2_mask |= SI_PREFETCH_LS;
-       if (ctx->queued.named.hs)
-               ctx->prefetch_L2_mask |= SI_PREFETCH_HS;
-       if (ctx->queued.named.es)
-               ctx->prefetch_L2_mask |= SI_PREFETCH_ES;
-       if (ctx->queued.named.gs)
-               ctx->prefetch_L2_mask |= SI_PREFETCH_GS;
-       if (ctx->queued.named.vs)
-               ctx->prefetch_L2_mask |= SI_PREFETCH_VS;
-       if (ctx->queued.named.ps)
-               ctx->prefetch_L2_mask |= SI_PREFETCH_PS;
-       if (ctx->vb_descriptors_buffer && ctx->vertex_elements)
-               ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
-
-       /* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */
-       bool has_clear_state = ctx->screen->info.has_clear_state;
-       if (has_clear_state) {
-               ctx->framebuffer.dirty_cbufs =
-                        u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs);
-               /* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */
-               ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL;
-       } else {
-               ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8);
-               ctx->framebuffer.dirty_zsbuf = true;
-       }
-       /* This should always be marked as dirty to set the framebuffer scissor
-        * at least. */
-       si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer);
-
-       si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_regs);
-       /* CLEAR_STATE sets zeros. */
-       if (!has_clear_state || ctx->clip_state.any_nonzeros)
-               si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_state);
-       ctx->sample_locs_num_samples = 0;
-       si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_sample_locs);
-       si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_config);
-       /* CLEAR_STATE sets 0xffff. */
-       if (!has_clear_state || ctx->sample_mask != 0xffff)
-               si_mark_atom_dirty(ctx, &ctx->atoms.s.sample_mask);
-       si_mark_atom_dirty(ctx, &ctx->atoms.s.cb_render_state);
-       /* CLEAR_STATE sets zeros. */
-       if (!has_clear_state || ctx->blend_color.any_nonzeros)
-               si_mark_atom_dirty(ctx, &ctx->atoms.s.blend_color);
-       si_mark_atom_dirty(ctx, &ctx->atoms.s.db_render_state);
-       if (ctx->chip_class >= GFX9)
-               si_mark_atom_dirty(ctx, &ctx->atoms.s.dpbb_state);
-       si_mark_atom_dirty(ctx, &ctx->atoms.s.stencil_ref);
-       si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_map);
-       if (!ctx->screen->use_ngg_streamout)
-               si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable);
-       si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond);
-       /* CLEAR_STATE disables all window rectangles. */
-       if (!has_clear_state || ctx->num_window_rectangles > 0)
-               si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles);
-
-       si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
-       si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
-       si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
-
-       si_mark_atom_dirty(ctx, &ctx->atoms.s.scratch_state);
-       if (ctx->scratch_buffer) {
-               si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b);
-       }
-
-       if (ctx->streamout.suspended) {
-               ctx->streamout.append_bitmask = ctx->streamout.enabled_mask;
-               si_streamout_buffers_dirty(ctx);
-       }
-
-       if (!list_is_empty(&ctx->active_queries))
-               si_resume_queries(ctx);
-
-       assert(!ctx->gfx_cs->prev_dw);
-       ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
-
-       /* Invalidate various draw states so that they are emitted before
-        * the first draw call. */
-       si_invalidate_draw_sh_constants(ctx);
-       ctx->last_index_size = -1;
-       ctx->last_primitive_restart_en = -1;
-       ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN;
-       ctx->last_prim = -1;
-       ctx->last_multi_vgt_param = -1;
-       ctx->last_vs_state = ~0;
-       ctx->last_ls = NULL;
-       ctx->last_tcs = NULL;
-       ctx->last_tes_sh_base = -1;
-       ctx->last_num_tcs_input_cp = -1;
-       ctx->last_ls_hs_config = -1; /* impossible value */
-       ctx->last_binning_enabled = -1;
-       ctx->small_prim_cull_info_dirty = ctx->small_prim_cull_info_buf != NULL;
-
-       ctx->prim_discard_compute_ib_initialized = false;
-
-        /* Compute-based primitive discard:
-         *   The index ring is divided into 2 halves. Switch between the halves
-         *   in the same fashion as doublebuffering.
-         */
-        if (ctx->index_ring_base)
-                ctx->index_ring_base = 0;
-        else
-                ctx->index_ring_base = ctx->index_ring_size_per_ib;
-
-        ctx->index_ring_offset = 0;
-
-       STATIC_ASSERT(SI_NUM_TRACKED_REGS <= sizeof(ctx->tracked_regs.reg_saved) * 8);
-
-       if (has_clear_state) {
-               ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff;
-               ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_EPSILON] = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_CONTROL] = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x00001000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA] = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_MODE_CNTL_1] = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_PRIM_FILTER_CNTL] = 0;
-               ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL] = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__VS] = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__CL] = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_CLIP_CNTL] = 0x00090000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x00000003;
-               ctx->tracked_regs.reg_value[SI_TRACKED_DB_DFSM_CONTROL] = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ]  = 0x3f800000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ]  = 0x3f800000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ]  = 0x3f800000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ]  = 0x3f800000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET] = 0;
-               ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_VTX_CNTL] = 0x00000005;
-               ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE]     = 0xffff;
-               ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_STIPPLE]      = 0;
-               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MODE]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_REUSE_OFF]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_GE_NGG_SUBGRP_CNTL]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_IDX_FORMAT]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_POS_FORMAT]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VTE_CNTL]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_NGG_CNTL]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ENA]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_SPI_BARYC_CNTL]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_IN_CONTROL]  = 0x00000002;
-               ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_Z_FORMAT]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_COL_FORMAT]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_CB_SHADER_MASK]  = 0xffffffff;
-               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_TF_PARAM]  = 0x00000000;
-               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL]  = 0x0000001e; /* From GFX8 */
-
-               /* Set all cleared context registers to saved. */
-               ctx->tracked_regs.reg_saved = ~(1ull << SI_TRACKED_GE_PC_ALLOC); /* uconfig reg */
-               ctx->last_gs_out_prim = 0; /* cleared by CLEAR_STATE */
-       } else {
-               /* Set all register values to unknown. */
-               ctx->tracked_regs.reg_saved = 0;
-               ctx->last_gs_out_prim = -1; /* unknown */
-       }
-
-       /* 0xffffffff is a impossible value to register SPI_PS_INPUT_CNTL_n */
-       memset(ctx->tracked_regs.spi_ps_input_cntl, 0xff, sizeof(uint32_t) * 32);
+   if (ctx->is_debug)
+      si_begin_gfx_cs_debug(ctx);
+
+   si_add_gds_to_buffer_list(ctx);
+
+   /* Always invalidate caches at the beginning of IBs, because external
+    * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
+    * buffers.
+    *
+    * Note that the cache flush done by the kernel at the end of GFX IBs
+    * isn't useful here, because that flush can finish after the following
+    * IB starts drawing.
+    *
+    * TODO: Do we also need to invalidate CB & DB caches?
+    */
+   ctx->flags |= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
+                 SI_CONTEXT_INV_L2 | SI_CONTEXT_START_PIPELINE_STATS;
+
+   ctx->cs_shader_state.initialized = false;
+   si_all_descriptors_begin_new_cs(ctx);
+
+   if (!ctx->has_graphics) {
+      ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
+      return;
+   }
+
+   /* Set all valid groups as dirty so they get re-emitted on the
+    * next draw command.
+    */
+   si_pm4_reset_emitted(ctx);
+
+   /* The CS initialization should be emitted before everything else. */
+   si_pm4_emit(ctx, ctx->init_config);
+   if (ctx->init_config_gs_rings)
+      si_pm4_emit(ctx, ctx->init_config_gs_rings);
+
+   if (ctx->queued.named.ls)
+      ctx->prefetch_L2_mask |= SI_PREFETCH_LS;
+   if (ctx->queued.named.hs)
+      ctx->prefetch_L2_mask |= SI_PREFETCH_HS;
+   if (ctx->queued.named.es)
+      ctx->prefetch_L2_mask |= SI_PREFETCH_ES;
+   if (ctx->queued.named.gs)
+      ctx->prefetch_L2_mask |= SI_PREFETCH_GS;
+   if (ctx->queued.named.vs)
+      ctx->prefetch_L2_mask |= SI_PREFETCH_VS;
+   if (ctx->queued.named.ps)
+      ctx->prefetch_L2_mask |= SI_PREFETCH_PS;
+   if (ctx->vb_descriptors_buffer && ctx->vertex_elements)
+      ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
+
+   /* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */
+   bool has_clear_state = ctx->screen->info.has_clear_state;
+   if (has_clear_state) {
+      ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs);
+      /* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */
+      ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL;
+   } else {
+      ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8);
+      ctx->framebuffer.dirty_zsbuf = true;
+   }
+   /* This should always be marked as dirty to set the framebuffer scissor
+    * at least. */
+   si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer);
+
+   si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_regs);
+   /* CLEAR_STATE sets zeros. */
+   if (!has_clear_state || ctx->clip_state.any_nonzeros)
+      si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_state);
+   ctx->sample_locs_num_samples = 0;
+   si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_sample_locs);
+   si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_config);
+   /* CLEAR_STATE sets 0xffff. */
+   if (!has_clear_state || ctx->sample_mask != 0xffff)
+      si_mark_atom_dirty(ctx, &ctx->atoms.s.sample_mask);
+   si_mark_atom_dirty(ctx, &ctx->atoms.s.cb_render_state);
+   /* CLEAR_STATE sets zeros. */
+   if (!has_clear_state || ctx->blend_color.any_nonzeros)
+      si_mark_atom_dirty(ctx, &ctx->atoms.s.blend_color);
+   si_mark_atom_dirty(ctx, &ctx->atoms.s.db_render_state);
+   if (ctx->chip_class >= GFX9)
+      si_mark_atom_dirty(ctx, &ctx->atoms.s.dpbb_state);
+   si_mark_atom_dirty(ctx, &ctx->atoms.s.stencil_ref);
+   si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_map);
+   if (!ctx->screen->use_ngg_streamout)
+      si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable);
+   si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond);
+   /* CLEAR_STATE disables all window rectangles. */
+   if (!has_clear_state || ctx->num_window_rectangles > 0)
+      si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles);
+
+   si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
+   si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
+   si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
+
+   si_mark_atom_dirty(ctx, &ctx->atoms.s.scratch_state);
+   if (ctx->scratch_buffer) {
+      si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b);
+   }
+
+   if (ctx->streamout.suspended) {
+      ctx->streamout.append_bitmask = ctx->streamout.enabled_mask;
+      si_streamout_buffers_dirty(ctx);
+   }
+
+   if (!list_is_empty(&ctx->active_queries))
+      si_resume_queries(ctx);
+
+   assert(!ctx->gfx_cs->prev_dw);
+   ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
+
+   /* Invalidate various draw states so that they are emitted before
+    * the first draw call. */
+   si_invalidate_draw_sh_constants(ctx);
+   ctx->last_index_size = -1;
+   ctx->last_primitive_restart_en = -1;
+   ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN;
+   ctx->last_prim = -1;
+   ctx->last_multi_vgt_param = -1;
+   ctx->last_vs_state = ~0;
+   ctx->last_ls = NULL;
+   ctx->last_tcs = NULL;
+   ctx->last_tes_sh_base = -1;
+   ctx->last_num_tcs_input_cp = -1;
+   ctx->last_ls_hs_config = -1; /* impossible value */
+   ctx->last_binning_enabled = -1;
+   ctx->small_prim_cull_info_dirty = ctx->small_prim_cull_info_buf != NULL;
+
+   ctx->prim_discard_compute_ib_initialized = false;
+
+   /* Compute-based primitive discard:
+    *   The index ring is divided into 2 halves. Switch between the halves
+    *   in the same fashion as double buffering.
+    */
+   if (ctx->index_ring_base)
+      ctx->index_ring_base = 0;
+   else
+      ctx->index_ring_base = ctx->index_ring_size_per_ib;
+
+   ctx->index_ring_offset = 0;
+
+   STATIC_ASSERT(SI_NUM_TRACKED_REGS <= sizeof(ctx->tracked_regs.reg_saved) * 8);
+
+   if (has_clear_state) {
+      ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff;
+      ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_EPSILON] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_CONTROL] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x00001000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_MODE_CNTL_1] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_PRIM_FILTER_CNTL] = 0;
+      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__VS] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__CL] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_CLIP_CNTL] = 0x00090000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x00000003;
+      ctx->tracked_regs.reg_value[SI_TRACKED_DB_DFSM_CONTROL] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ] = 0x3f800000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ] = 0x3f800000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ] = 0x3f800000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ] = 0x3f800000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET] = 0;
+      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_VTX_CNTL] = 0x00000005;
+      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE] = 0xffff;
+      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_STIPPLE] = 0;
+      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MODE] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_REUSE_OFF] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_GE_NGG_SUBGRP_CNTL] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_IDX_FORMAT] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_POS_FORMAT] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VTE_CNTL] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_NGG_CNTL] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ENA] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_BARYC_CNTL] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_IN_CONTROL] = 0x00000002;
+      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_Z_FORMAT] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_COL_FORMAT] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_CB_SHADER_MASK] = 0xffffffff;
+      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_TF_PARAM] = 0x00000000;
+      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL] =
+         0x0000001e; /* From GFX8 */
+
+      /* Set all cleared context registers to saved. */
+      ctx->tracked_regs.reg_saved = ~(1ull << SI_TRACKED_GE_PC_ALLOC); /* uconfig reg */
+      ctx->last_gs_out_prim = 0;                                       /* cleared by CLEAR_STATE */
+   } else {
+      /* Set all register values to unknown. */
+      ctx->tracked_regs.reg_saved = 0;
+      ctx->last_gs_out_prim = -1; /* unknown */
+   }
+
+   /* 0xffffffff is an impossible value for the SPI_PS_INPUT_CNTL_n registers. */
+   memset(ctx->tracked_regs.spi_ps_input_cntl, 0xff, sizeof(uint32_t) * 32);
 }
index 33cd5642230fafdf732b9e87fb9b71ba0a347c82..806f98ad520286f9c52b87233ae9ef93d5716bbc 100644 (file)
  * fps (there are too few samples per frame). */
 #define SAMPLES_PER_SEC 10000
 
-#define GRBM_STATUS            0x8010
-#define TA_BUSY(x)             (((x) >> 14) & 0x1)
-#define GDS_BUSY(x)            (((x) >> 15) & 0x1)
-#define VGT_BUSY(x)            (((x) >> 17) & 0x1)
-#define IA_BUSY(x)             (((x) >> 19) & 0x1)
-#define SX_BUSY(x)             (((x) >> 20) & 0x1)
-#define WD_BUSY(x)             (((x) >> 21) & 0x1)
-#define SPI_BUSY(x)            (((x) >> 22) & 0x1)
-#define BCI_BUSY(x)            (((x) >> 23) & 0x1)
-#define SC_BUSY(x)             (((x) >> 24) & 0x1)
-#define PA_BUSY(x)             (((x) >> 25) & 0x1)
-#define DB_BUSY(x)             (((x) >> 26) & 0x1)
-#define CP_BUSY(x)             (((x) >> 29) & 0x1)
-#define CB_BUSY(x)             (((x) >> 30) & 0x1)
-#define GUI_ACTIVE(x)          (((x) >> 31) & 0x1)
-
-#define SRBM_STATUS2           0x0e4c
-#define SDMA_BUSY(x)           (((x) >> 5) & 0x1)
-
-#define CP_STAT                 0x8680
-#define PFP_BUSY(x)            (((x) >> 15) & 0x1)
-#define MEQ_BUSY(x)            (((x) >> 16) & 0x1)
-#define ME_BUSY(x)             (((x) >> 17) & 0x1)
-#define SURFACE_SYNC_BUSY(x)   (((x) >> 21) & 0x1)
-#define DMA_BUSY(x)            (((x) >> 22) & 0x1)
-#define SCRATCH_RAM_BUSY(x)    (((x) >> 24) & 0x1)
+#define GRBM_STATUS   0x8010
+#define TA_BUSY(x)    (((x) >> 14) & 0x1)
+#define GDS_BUSY(x)   (((x) >> 15) & 0x1)
+#define VGT_BUSY(x)   (((x) >> 17) & 0x1)
+#define IA_BUSY(x)    (((x) >> 19) & 0x1)
+#define SX_BUSY(x)    (((x) >> 20) & 0x1)
+#define WD_BUSY(x)    (((x) >> 21) & 0x1)
+#define SPI_BUSY(x)   (((x) >> 22) & 0x1)
+#define BCI_BUSY(x)   (((x) >> 23) & 0x1)
+#define SC_BUSY(x)    (((x) >> 24) & 0x1)
+#define PA_BUSY(x)    (((x) >> 25) & 0x1)
+#define DB_BUSY(x)    (((x) >> 26) & 0x1)
+#define CP_BUSY(x)    (((x) >> 29) & 0x1)
+#define CB_BUSY(x)    (((x) >> 30) & 0x1)
+#define GUI_ACTIVE(x) (((x) >> 31) & 0x1)
+
+#define SRBM_STATUS2 0x0e4c
+#define SDMA_BUSY(x) (((x) >> 5) & 0x1)
+
+#define CP_STAT              0x8680
+#define PFP_BUSY(x)          (((x) >> 15) & 0x1)
+#define MEQ_BUSY(x)          (((x) >> 16) & 0x1)
+#define ME_BUSY(x)           (((x) >> 17) & 0x1)
+#define SURFACE_SYNC_BUSY(x) (((x) >> 21) & 0x1)
+#define DMA_BUSY(x)          (((x) >> 22) & 0x1)
+#define SCRATCH_RAM_BUSY(x)  (((x) >> 24) & 0x1)
 
 #define IDENTITY(x) x
 
-#define UPDATE_COUNTER(field, mask)                                    \
-       do {                                                            \
-               if (mask(value))                                        \
-                       p_atomic_inc(&counters->named.field.busy);      \
-               else                                                    \
-                       p_atomic_inc(&counters->named.field.idle);      \
-       } while (0)
+#define UPDATE_COUNTER(field, mask)                                                                \
+   do {                                                                                            \
+      if (mask(value))                                                                             \
+         p_atomic_inc(&counters->named.field.busy);                                                \
+      else                                                                                         \
+         p_atomic_inc(&counters->named.field.idle);                                                \
+   } while (0)
 
-static void si_update_mmio_counters(struct si_screen *sscreen,
-                                   union si_mmio_counters *counters)
+static void si_update_mmio_counters(struct si_screen *sscreen, union si_mmio_counters *counters)
 {
-       uint32_t value = 0;
-       bool gui_busy, sdma_busy = false;
-
-       /* GRBM_STATUS */
-       sscreen->ws->read_registers(sscreen->ws, GRBM_STATUS, 1, &value);
-
-       UPDATE_COUNTER(ta, TA_BUSY);
-       UPDATE_COUNTER(gds, GDS_BUSY);
-       UPDATE_COUNTER(vgt, VGT_BUSY);
-       UPDATE_COUNTER(ia, IA_BUSY);
-       UPDATE_COUNTER(sx, SX_BUSY);
-       UPDATE_COUNTER(wd, WD_BUSY);
-       UPDATE_COUNTER(spi, SPI_BUSY);
-       UPDATE_COUNTER(bci, BCI_BUSY);
-       UPDATE_COUNTER(sc, SC_BUSY);
-       UPDATE_COUNTER(pa, PA_BUSY);
-       UPDATE_COUNTER(db, DB_BUSY);
-       UPDATE_COUNTER(cp, CP_BUSY);
-       UPDATE_COUNTER(cb, CB_BUSY);
-       UPDATE_COUNTER(gui, GUI_ACTIVE);
-       gui_busy = GUI_ACTIVE(value);
-
-       if (sscreen->info.chip_class == GFX7 || sscreen->info.chip_class == GFX8) {
-               /* SRBM_STATUS2 */
-               sscreen->ws->read_registers(sscreen->ws, SRBM_STATUS2, 1, &value);
-
-               UPDATE_COUNTER(sdma, SDMA_BUSY);
-               sdma_busy = SDMA_BUSY(value);
-       }
-
-       if (sscreen->info.chip_class >= GFX8) {
-               /* CP_STAT */
-               sscreen->ws->read_registers(sscreen->ws, CP_STAT, 1, &value);
-
-               UPDATE_COUNTER(pfp, PFP_BUSY);
-               UPDATE_COUNTER(meq, MEQ_BUSY);
-               UPDATE_COUNTER(me, ME_BUSY);
-               UPDATE_COUNTER(surf_sync, SURFACE_SYNC_BUSY);
-               UPDATE_COUNTER(cp_dma, DMA_BUSY);
-               UPDATE_COUNTER(scratch_ram, SCRATCH_RAM_BUSY);
-       }
-
-       value = gui_busy || sdma_busy;
-       UPDATE_COUNTER(gpu, IDENTITY);
+   uint32_t value = 0;
+   bool gui_busy, sdma_busy = false;
+
+   /* GRBM_STATUS */
+   sscreen->ws->read_registers(sscreen->ws, GRBM_STATUS, 1, &value);
+
+   UPDATE_COUNTER(ta, TA_BUSY);
+   UPDATE_COUNTER(gds, GDS_BUSY);
+   UPDATE_COUNTER(vgt, VGT_BUSY);
+   UPDATE_COUNTER(ia, IA_BUSY);
+   UPDATE_COUNTER(sx, SX_BUSY);
+   UPDATE_COUNTER(wd, WD_BUSY);
+   UPDATE_COUNTER(spi, SPI_BUSY);
+   UPDATE_COUNTER(bci, BCI_BUSY);
+   UPDATE_COUNTER(sc, SC_BUSY);
+   UPDATE_COUNTER(pa, PA_BUSY);
+   UPDATE_COUNTER(db, DB_BUSY);
+   UPDATE_COUNTER(cp, CP_BUSY);
+   UPDATE_COUNTER(cb, CB_BUSY);
+   UPDATE_COUNTER(gui, GUI_ACTIVE);
+   gui_busy = GUI_ACTIVE(value);
+
+   if (sscreen->info.chip_class == GFX7 || sscreen->info.chip_class == GFX8) {
+      /* SRBM_STATUS2 */
+      sscreen->ws->read_registers(sscreen->ws, SRBM_STATUS2, 1, &value);
+
+      UPDATE_COUNTER(sdma, SDMA_BUSY);
+      sdma_busy = SDMA_BUSY(value);
+   }
+
+   if (sscreen->info.chip_class >= GFX8) {
+      /* CP_STAT */
+      sscreen->ws->read_registers(sscreen->ws, CP_STAT, 1, &value);
+
+      UPDATE_COUNTER(pfp, PFP_BUSY);
+      UPDATE_COUNTER(meq, MEQ_BUSY);
+      UPDATE_COUNTER(me, ME_BUSY);
+      UPDATE_COUNTER(surf_sync, SURFACE_SYNC_BUSY);
+      UPDATE_COUNTER(cp_dma, DMA_BUSY);
+      UPDATE_COUNTER(scratch_ram, SCRATCH_RAM_BUSY);
+   }
+
+   value = gui_busy || sdma_busy;
+   UPDATE_COUNTER(gpu, IDENTITY);
 }
 
 #undef UPDATE_COUNTER
 
-static int
-si_gpu_load_thread(void *param)
+static int si_gpu_load_thread(void *param)
 {
-       struct si_screen *sscreen = (struct si_screen*)param;
-       const int period_us = 1000000 / SAMPLES_PER_SEC;
-       int sleep_us = period_us;
-       int64_t cur_time, last_time = os_time_get();
-
-       while (!p_atomic_read(&sscreen->gpu_load_stop_thread)) {
-               if (sleep_us)
-                       os_time_sleep(sleep_us);
-
-               /* Make sure we sleep the ideal amount of time to match
-                * the expected frequency. */
-               cur_time = os_time_get();
-
-               if (os_time_timeout(last_time, last_time + period_us,
-                                   cur_time))
-                       sleep_us = MAX2(sleep_us - 1, 1);
-               else
-                       sleep_us += 1;
-
-               /*printf("Hz: %.1f\n", 1000000.0 / (cur_time - last_time));*/
-               last_time = cur_time;
-
-               /* Update the counters. */
-               si_update_mmio_counters(sscreen, &sscreen->mmio_counters);
-       }
-       p_atomic_dec(&sscreen->gpu_load_stop_thread);
-       return 0;
+   struct si_screen *sscreen = (struct si_screen *)param;
+   const int period_us = 1000000 / SAMPLES_PER_SEC;
+   int sleep_us = period_us;
+   int64_t cur_time, last_time = os_time_get();
+
+   while (!p_atomic_read(&sscreen->gpu_load_stop_thread)) {
+      if (sleep_us)
+         os_time_sleep(sleep_us);
+
+      /* Make sure we sleep the ideal amount of time to match
+       * the expected frequency. */
+      cur_time = os_time_get();
+
+      if (os_time_timeout(last_time, last_time + period_us, cur_time))
+         sleep_us = MAX2(sleep_us - 1, 1);
+      else
+         sleep_us += 1;
+
+      /*printf("Hz: %.1f\n", 1000000.0 / (cur_time - last_time));*/
+      last_time = cur_time;
+
+      /* Update the counters. */
+      si_update_mmio_counters(sscreen, &sscreen->mmio_counters);
+   }
+   p_atomic_dec(&sscreen->gpu_load_stop_thread);
+   return 0;
 }
 
 void si_gpu_load_kill_thread(struct si_screen *sscreen)
 {
-       if (!sscreen->gpu_load_thread)
-               return;
+   if (!sscreen->gpu_load_thread)
+      return;
 
-       p_atomic_inc(&sscreen->gpu_load_stop_thread);
-       thrd_join(sscreen->gpu_load_thread, NULL);
-       sscreen->gpu_load_thread = 0;
+   p_atomic_inc(&sscreen->gpu_load_stop_thread);
+   thrd_join(sscreen->gpu_load_thread, NULL);
+   sscreen->gpu_load_thread = 0;
 }
 
-static uint64_t si_read_mmio_counter(struct si_screen *sscreen,
-                                    unsigned busy_index)
+static uint64_t si_read_mmio_counter(struct si_screen *sscreen, unsigned busy_index)
 {
-       /* Start the thread if needed. */
-       if (!sscreen->gpu_load_thread) {
-               simple_mtx_lock(&sscreen->gpu_load_mutex);
-               /* Check again inside the mutex. */
-               if (!sscreen->gpu_load_thread)
-                       sscreen->gpu_load_thread =
-                               u_thread_create(si_gpu_load_thread, sscreen);
-               simple_mtx_unlock(&sscreen->gpu_load_mutex);
-       }
-
-       unsigned busy = p_atomic_read(&sscreen->mmio_counters.array[busy_index]);
-       unsigned idle = p_atomic_read(&sscreen->mmio_counters.array[busy_index + 1]);
-
-       return busy | ((uint64_t)idle << 32);
+   /* Start the thread if needed. */
+   if (!sscreen->gpu_load_thread) {
+      simple_mtx_lock(&sscreen->gpu_load_mutex);
+      /* Check again inside the mutex. */
+      if (!sscreen->gpu_load_thread)
+         sscreen->gpu_load_thread = u_thread_create(si_gpu_load_thread, sscreen);
+      simple_mtx_unlock(&sscreen->gpu_load_mutex);
+   }
+
+   unsigned busy = p_atomic_read(&sscreen->mmio_counters.array[busy_index]);
+   unsigned idle = p_atomic_read(&sscreen->mmio_counters.array[busy_index + 1]);
+
+   return busy | ((uint64_t)idle << 32);
 }
 
-static unsigned si_end_mmio_counter(struct si_screen *sscreen,
-                                   uint64_t begin, unsigned busy_index)
+static unsigned si_end_mmio_counter(struct si_screen *sscreen, uint64_t begin, unsigned busy_index)
 {
-       uint64_t end = si_read_mmio_counter(sscreen, busy_index);
-       unsigned busy = (end & 0xffffffff) - (begin & 0xffffffff);
-       unsigned idle = (end >> 32) - (begin >> 32);
-
-       /* Calculate the % of time the busy counter was being incremented.
-        *
-        * If no counters were incremented, return the current counter status.
-        * It's for the case when the load is queried faster than
-        * the counters are updated.
-        */
-       if (idle || busy) {
-               return busy*100 / (busy + idle);
-       } else {
-               union si_mmio_counters counters;
-
-               memset(&counters, 0, sizeof(counters));
-               si_update_mmio_counters(sscreen, &counters);
-               return counters.array[busy_index] ? 100 : 0;
-       }
+   uint64_t end = si_read_mmio_counter(sscreen, busy_index);
+   unsigned busy = (end & 0xffffffff) - (begin & 0xffffffff);
+   unsigned idle = (end >> 32) - (begin >> 32);
+
+   /* Calculate the % of time the busy counter was being incremented.
+    *
+    * If no counters were incremented, return the current counter status.
+    * It's for the case when the load is queried faster than
+    * the counters are updated.
+    */
+   if (idle || busy) {
+      return busy * 100 / (busy + idle);
+   } else {
+      union si_mmio_counters counters;
+
+      memset(&counters, 0, sizeof(counters));
+      si_update_mmio_counters(sscreen, &counters);
+      return counters.array[busy_index] ? 100 : 0;
+   }
 }
 
-#define BUSY_INDEX(sscreen, field) (&sscreen->mmio_counters.named.field.busy - \
-                                   sscreen->mmio_counters.array)
+#define BUSY_INDEX(sscreen, field)                                                                 \
+   (&sscreen->mmio_counters.named.field.busy - sscreen->mmio_counters.array)
 
-static unsigned busy_index_from_type(struct si_screen *sscreen,
-                                    unsigned type)
+static unsigned busy_index_from_type(struct si_screen *sscreen, unsigned type)
 {
-       switch (type) {
-       case SI_QUERY_GPU_LOAD:
-               return BUSY_INDEX(sscreen, gpu);
-       case SI_QUERY_GPU_SHADERS_BUSY:
-               return BUSY_INDEX(sscreen, spi);
-       case SI_QUERY_GPU_TA_BUSY:
-               return BUSY_INDEX(sscreen, ta);
-       case SI_QUERY_GPU_GDS_BUSY:
-               return BUSY_INDEX(sscreen, gds);
-       case SI_QUERY_GPU_VGT_BUSY:
-               return BUSY_INDEX(sscreen, vgt);
-       case SI_QUERY_GPU_IA_BUSY:
-               return BUSY_INDEX(sscreen, ia);
-       case SI_QUERY_GPU_SX_BUSY:
-               return BUSY_INDEX(sscreen, sx);
-       case SI_QUERY_GPU_WD_BUSY:
-               return BUSY_INDEX(sscreen, wd);
-       case SI_QUERY_GPU_BCI_BUSY:
-               return BUSY_INDEX(sscreen, bci);
-       case SI_QUERY_GPU_SC_BUSY:
-               return BUSY_INDEX(sscreen, sc);
-       case SI_QUERY_GPU_PA_BUSY:
-               return BUSY_INDEX(sscreen, pa);
-       case SI_QUERY_GPU_DB_BUSY:
-               return BUSY_INDEX(sscreen, db);
-       case SI_QUERY_GPU_CP_BUSY:
-               return BUSY_INDEX(sscreen, cp);
-       case SI_QUERY_GPU_CB_BUSY:
-               return BUSY_INDEX(sscreen, cb);
-       case SI_QUERY_GPU_SDMA_BUSY:
-               return BUSY_INDEX(sscreen, sdma);
-       case SI_QUERY_GPU_PFP_BUSY:
-               return BUSY_INDEX(sscreen, pfp);
-       case SI_QUERY_GPU_MEQ_BUSY:
-               return BUSY_INDEX(sscreen, meq);
-       case SI_QUERY_GPU_ME_BUSY:
-               return BUSY_INDEX(sscreen, me);
-       case SI_QUERY_GPU_SURF_SYNC_BUSY:
-               return BUSY_INDEX(sscreen, surf_sync);
-       case SI_QUERY_GPU_CP_DMA_BUSY:
-               return BUSY_INDEX(sscreen, cp_dma);
-       case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
-               return BUSY_INDEX(sscreen, scratch_ram);
-       default:
-               unreachable("invalid query type");
-       }
+   switch (type) {
+   case SI_QUERY_GPU_LOAD:
+      return BUSY_INDEX(sscreen, gpu);
+   case SI_QUERY_GPU_SHADERS_BUSY:
+      return BUSY_INDEX(sscreen, spi);
+   case SI_QUERY_GPU_TA_BUSY:
+      return BUSY_INDEX(sscreen, ta);
+   case SI_QUERY_GPU_GDS_BUSY:
+      return BUSY_INDEX(sscreen, gds);
+   case SI_QUERY_GPU_VGT_BUSY:
+      return BUSY_INDEX(sscreen, vgt);
+   case SI_QUERY_GPU_IA_BUSY:
+      return BUSY_INDEX(sscreen, ia);
+   case SI_QUERY_GPU_SX_BUSY:
+      return BUSY_INDEX(sscreen, sx);
+   case SI_QUERY_GPU_WD_BUSY:
+      return BUSY_INDEX(sscreen, wd);
+   case SI_QUERY_GPU_BCI_BUSY:
+      return BUSY_INDEX(sscreen, bci);
+   case SI_QUERY_GPU_SC_BUSY:
+      return BUSY_INDEX(sscreen, sc);
+   case SI_QUERY_GPU_PA_BUSY:
+      return BUSY_INDEX(sscreen, pa);
+   case SI_QUERY_GPU_DB_BUSY:
+      return BUSY_INDEX(sscreen, db);
+   case SI_QUERY_GPU_CP_BUSY:
+      return BUSY_INDEX(sscreen, cp);
+   case SI_QUERY_GPU_CB_BUSY:
+      return BUSY_INDEX(sscreen, cb);
+   case SI_QUERY_GPU_SDMA_BUSY:
+      return BUSY_INDEX(sscreen, sdma);
+   case SI_QUERY_GPU_PFP_BUSY:
+      return BUSY_INDEX(sscreen, pfp);
+   case SI_QUERY_GPU_MEQ_BUSY:
+      return BUSY_INDEX(sscreen, meq);
+   case SI_QUERY_GPU_ME_BUSY:
+      return BUSY_INDEX(sscreen, me);
+   case SI_QUERY_GPU_SURF_SYNC_BUSY:
+      return BUSY_INDEX(sscreen, surf_sync);
+   case SI_QUERY_GPU_CP_DMA_BUSY:
+      return BUSY_INDEX(sscreen, cp_dma);
+   case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
+      return BUSY_INDEX(sscreen, scratch_ram);
+   default:
+      unreachable("invalid query type");
+   }
 }
 
 uint64_t si_begin_counter(struct si_screen *sscreen, unsigned type)
 {
-       unsigned busy_index = busy_index_from_type(sscreen, type);
-       return si_read_mmio_counter(sscreen, busy_index);
+   unsigned busy_index = busy_index_from_type(sscreen, type);
+   return si_read_mmio_counter(sscreen, busy_index);
 }
 
-unsigned si_end_counter(struct si_screen *sscreen, unsigned type,
-                       uint64_t begin)
+unsigned si_end_counter(struct si_screen *sscreen, unsigned type, uint64_t begin)
 {
-       unsigned busy_index = busy_index_from_type(sscreen, type);
-       return si_end_mmio_counter(sscreen, begin, busy_index);
+   unsigned busy_index = busy_index_from_type(sscreen, type);
+   return si_end_mmio_counter(sscreen, begin, busy_index);
 }
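
A minimal usage sketch of how these two helpers pair up, assuming an initialized struct si_screen and the SI_QUERY_GPU_LOAD type handled by busy_index_from_type() above; the wrapper name below is hypothetical:

/* Hypothetical helper: sample overall GPU load over roughly one second.
 * si_begin_counter(), si_end_counter() and SI_QUERY_GPU_LOAD are shown above;
 * os_time_sleep() takes microseconds. */
static unsigned sample_gpu_load_percent(struct si_screen *sscreen)
{
   uint64_t begin = si_begin_counter(sscreen, SI_QUERY_GPU_LOAD);
   os_time_sleep(1000 * 1000);
   return si_end_counter(sscreen, SI_QUERY_GPU_LOAD, begin); /* 0..100 */
}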
index 974ac430c530a6a48a12eedb830f6d07a35025f7..ca13ca8a639b1357a6ae590f88ae476037207451 100644 (file)
 #include "si_query.h"
 #include "util/u_memory.h"
 
+enum si_pc_block_flags
+{
+   /* This block is part of the shader engine */
+   SI_PC_BLOCK_SE = (1 << 0),
 
-enum si_pc_block_flags {
-       /* This block is part of the shader engine */
-       SI_PC_BLOCK_SE = (1 << 0),
-
-       /* Expose per-instance groups instead of summing all instances (within
-        * an SE). */
-       SI_PC_BLOCK_INSTANCE_GROUPS = (1 << 1),
+   /* Expose per-instance groups instead of summing all instances (within
+    * an SE). */
+   SI_PC_BLOCK_INSTANCE_GROUPS = (1 << 1),
 
-       /* Expose per-SE groups instead of summing instances across SEs. */
-       SI_PC_BLOCK_SE_GROUPS = (1 << 2),
+   /* Expose per-SE groups instead of summing instances across SEs. */
+   SI_PC_BLOCK_SE_GROUPS = (1 << 2),
 
-       /* Shader block */
-       SI_PC_BLOCK_SHADER = (1 << 3),
+   /* Shader block */
+   SI_PC_BLOCK_SHADER = (1 << 3),
 
-       /* Non-shader block with perfcounters windowed by shaders. */
-       SI_PC_BLOCK_SHADER_WINDOWED = (1 << 4),
+   /* Non-shader block with perfcounters windowed by shaders. */
+   SI_PC_BLOCK_SHADER_WINDOWED = (1 << 4),
 };
 
-enum si_pc_reg_layout {
-       /* All secondary selector dwords follow as one block after the primary
-        * selector dwords for the counters that have secondary selectors.
-        */
-       SI_PC_MULTI_BLOCK = 0,
+enum si_pc_reg_layout
+{
+   /* All secondary selector dwords follow as one block after the primary
+    * selector dwords for the counters that have secondary selectors.
+    */
+   SI_PC_MULTI_BLOCK = 0,
 
-       /* Each secondary selector dword follows immediately afters the
-        * corresponding primary.
-        */
-       SI_PC_MULTI_ALTERNATE = 1,
+   /* Each secondary selector dword follows immediately after the
+    * corresponding primary.
+    */
+   SI_PC_MULTI_ALTERNATE = 1,
 
-       /* All secondary selector dwords follow as one block after all primary
-        * selector dwords.
-        */
-       SI_PC_MULTI_TAIL = 2,
+   /* All secondary selector dwords follow as one block after all primary
+    * selector dwords.
+    */
+   SI_PC_MULTI_TAIL = 2,
 
-       /* Free-form arrangement of selector registers. */
-       SI_PC_MULTI_CUSTOM = 3,
+   /* Free-form arrangement of selector registers. */
+   SI_PC_MULTI_CUSTOM = 3,
 
-       SI_PC_MULTI_MASK = 3,
+   SI_PC_MULTI_MASK = 3,
 
-       /* Registers are laid out in decreasing rather than increasing order. */
-       SI_PC_REG_REVERSE = 4,
+   /* Registers are laid out in decreasing rather than increasing order. */
+   SI_PC_REG_REVERSE = 4,
 
-       SI_PC_FAKE = 8,
+   SI_PC_FAKE = 8,
 };
 
 struct si_pc_block_base {
-       const char *name;
-       unsigned num_counters;
-       unsigned flags;
-
-       unsigned select_or;
-       unsigned select0;
-       unsigned counter0_lo;
-       unsigned *select;
-       unsigned *counters;
-       unsigned num_multi;
-       unsigned num_prelude;
-       unsigned layout;
+   const char *name;
+   unsigned num_counters;
+   unsigned flags;
+
+   unsigned select_or;
+   unsigned select0;
+   unsigned counter0_lo;
+   unsigned *select;
+   unsigned *counters;
+   unsigned num_multi;
+   unsigned num_prelude;
+   unsigned layout;
 };
 
 struct si_pc_block_gfxdescr {
-       struct si_pc_block_base *b;
-       unsigned selectors;
-       unsigned instances;
+   struct si_pc_block_base *b;
+   unsigned selectors;
+   unsigned instances;
 };
 
 struct si_pc_block {
-       const struct si_pc_block_gfxdescr *b;
-       unsigned num_instances;
+   const struct si_pc_block_gfxdescr *b;
+   unsigned num_instances;
 
-       unsigned num_groups;
-       char *group_names;
-       unsigned group_name_stride;
+   unsigned num_groups;
+   char *group_names;
+   unsigned group_name_stride;
 
-       char *selector_names;
-       unsigned selector_name_stride;
+   char *selector_names;
+   unsigned selector_name_stride;
 };
 
 /* The order is chosen to be compatible with GPUPerfStudio's hardcoding of
  * performance counter group IDs.
  */
-static const char * const si_pc_shader_type_suffixes[] = {
-       "", "_ES", "_GS", "_VS", "_PS", "_LS", "_HS", "_CS"
-};
+static const char *const si_pc_shader_type_suffixes[] = {"",    "_ES", "_GS", "_VS",
+                                                         "_PS", "_LS", "_HS", "_CS"};
 
 static const unsigned si_pc_shader_type_bits[] = {
-       0x7f,
-       S_036780_ES_EN(1),
-       S_036780_GS_EN(1),
-       S_036780_VS_EN(1),
-       S_036780_PS_EN(1),
-       S_036780_LS_EN(1),
-       S_036780_HS_EN(1),
-       S_036780_CS_EN(1),
+   0x7f,
+   S_036780_ES_EN(1),
+   S_036780_GS_EN(1),
+   S_036780_VS_EN(1),
+   S_036780_PS_EN(1),
+   S_036780_LS_EN(1),
+   S_036780_HS_EN(1),
+   S_036780_CS_EN(1),
 };
 
 /* Max counters per HW block */
@@ -129,277 +129,274 @@ static const unsigned si_pc_shader_type_bits[] = {
 #define SI_PC_SHADERS_WINDOWING (1u << 31)
 
 struct si_query_group {
-       struct si_query_group *next;
-       struct si_pc_block *block;
-       unsigned sub_gid; /* only used during init */
-       unsigned result_base; /* only used during init */
-       int se;
-       int instance;
-       unsigned num_counters;
-       unsigned selectors[SI_QUERY_MAX_COUNTERS];
+   struct si_query_group *next;
+   struct si_pc_block *block;
+   unsigned sub_gid;     /* only used during init */
+   unsigned result_base; /* only used during init */
+   int se;
+   int instance;
+   unsigned num_counters;
+   unsigned selectors[SI_QUERY_MAX_COUNTERS];
 };
 
 struct si_query_counter {
-       unsigned base;
-       unsigned qwords;
-       unsigned stride; /* in uint64s */
+   unsigned base;
+   unsigned qwords;
+   unsigned stride; /* in uint64s */
 };
 
 struct si_query_pc {
-       struct si_query b;
-       struct si_query_buffer buffer;
+   struct si_query b;
+   struct si_query_buffer buffer;
 
-       /* Size of the results in memory, in bytes. */
-       unsigned result_size;
+   /* Size of the results in memory, in bytes. */
+   unsigned result_size;
 
-       unsigned shaders;
-       unsigned num_counters;
-       struct si_query_counter *counters;
-       struct si_query_group *groups;
+   unsigned shaders;
+   unsigned num_counters;
+   struct si_query_counter *counters;
+   struct si_query_group *groups;
 };
 
-
 static struct si_pc_block_base cik_CB = {
-       .name = "CB",
-       .num_counters = 4,
-       .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,
-
-       .select0 = R_037000_CB_PERFCOUNTER_FILTER,
-       .counter0_lo = R_035018_CB_PERFCOUNTER0_LO,
-       .num_multi = 1,
-       .num_prelude = 1,
-       .layout = SI_PC_MULTI_ALTERNATE,
+   .name = "CB",
+   .num_counters = 4,
+   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,
+
+   .select0 = R_037000_CB_PERFCOUNTER_FILTER,
+   .counter0_lo = R_035018_CB_PERFCOUNTER0_LO,
+   .num_multi = 1,
+   .num_prelude = 1,
+   .layout = SI_PC_MULTI_ALTERNATE,
 };
 
 static unsigned cik_CPC_select[] = {
-       R_036024_CPC_PERFCOUNTER0_SELECT,
-       R_036010_CPC_PERFCOUNTER0_SELECT1,
-       R_03600C_CPC_PERFCOUNTER1_SELECT,
+   R_036024_CPC_PERFCOUNTER0_SELECT,
+   R_036010_CPC_PERFCOUNTER0_SELECT1,
+   R_03600C_CPC_PERFCOUNTER1_SELECT,
 };
 static struct si_pc_block_base cik_CPC = {
-       .name = "CPC",
-       .num_counters = 2,
+   .name = "CPC",
+   .num_counters = 2,
 
-       .select = cik_CPC_select,
-       .counter0_lo = R_034018_CPC_PERFCOUNTER0_LO,
-       .num_multi = 1,
-       .layout = SI_PC_MULTI_CUSTOM | SI_PC_REG_REVERSE,
+   .select = cik_CPC_select,
+   .counter0_lo = R_034018_CPC_PERFCOUNTER0_LO,
+   .num_multi = 1,
+   .layout = SI_PC_MULTI_CUSTOM | SI_PC_REG_REVERSE,
 };
 
 static struct si_pc_block_base cik_CPF = {
-       .name = "CPF",
-       .num_counters = 2,
+   .name = "CPF",
+   .num_counters = 2,
 
-       .select0 = R_03601C_CPF_PERFCOUNTER0_SELECT,
-       .counter0_lo = R_034028_CPF_PERFCOUNTER0_LO,
-       .num_multi = 1,
-       .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
+   .select0 = R_03601C_CPF_PERFCOUNTER0_SELECT,
+   .counter0_lo = R_034028_CPF_PERFCOUNTER0_LO,
+   .num_multi = 1,
+   .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
 };
 
 static struct si_pc_block_base cik_CPG = {
-       .name = "CPG",
-       .num_counters = 2,
+   .name = "CPG",
+   .num_counters = 2,
 
-       .select0 = R_036008_CPG_PERFCOUNTER0_SELECT,
-       .counter0_lo = R_034008_CPG_PERFCOUNTER0_LO,
-       .num_multi = 1,
-       .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
+   .select0 = R_036008_CPG_PERFCOUNTER0_SELECT,
+   .counter0_lo = R_034008_CPG_PERFCOUNTER0_LO,
+   .num_multi = 1,
+   .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
 };
 
 static struct si_pc_block_base cik_DB = {
-       .name = "DB",
-       .num_counters = 4,
-       .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,
-
-       .select0 = R_037100_DB_PERFCOUNTER0_SELECT,
-       .counter0_lo = R_035100_DB_PERFCOUNTER0_LO,
-       .num_multi = 3, // really only 2, but there's a gap between registers
-       .layout = SI_PC_MULTI_ALTERNATE,
+   .name = "DB",
+   .num_counters = 4,
+   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,
+
+   .select0 = R_037100_DB_PERFCOUNTER0_SELECT,
+   .counter0_lo = R_035100_DB_PERFCOUNTER0_LO,
+   .num_multi = 3, // really only 2, but there's a gap between registers
+   .layout = SI_PC_MULTI_ALTERNATE,
 };
 
 static struct si_pc_block_base cik_GDS = {
-       .name = "GDS",
-       .num_counters = 4,
+   .name = "GDS",
+   .num_counters = 4,
 
-       .select0 = R_036A00_GDS_PERFCOUNTER0_SELECT,
-       .counter0_lo = R_034A00_GDS_PERFCOUNTER0_LO,
-       .num_multi = 1,
-       .layout = SI_PC_MULTI_TAIL,
+   .select0 = R_036A00_GDS_PERFCOUNTER0_SELECT,
+   .counter0_lo = R_034A00_GDS_PERFCOUNTER0_LO,
+   .num_multi = 1,
+   .layout = SI_PC_MULTI_TAIL,
 };
 
 static unsigned cik_GRBM_counters[] = {
-       R_034100_GRBM_PERFCOUNTER0_LO,
-       R_03410C_GRBM_PERFCOUNTER1_LO,
+   R_034100_GRBM_PERFCOUNTER0_LO,
+   R_03410C_GRBM_PERFCOUNTER1_LO,
 };
 static struct si_pc_block_base cik_GRBM = {
-       .name = "GRBM",
-       .num_counters = 2,
+   .name = "GRBM",
+   .num_counters = 2,
 
-       .select0 = R_036100_GRBM_PERFCOUNTER0_SELECT,
-       .counters = cik_GRBM_counters,
+   .select0 = R_036100_GRBM_PERFCOUNTER0_SELECT,
+   .counters = cik_GRBM_counters,
 };
 
 static struct si_pc_block_base cik_GRBMSE = {
-       .name = "GRBMSE",
-       .num_counters = 4,
+   .name = "GRBMSE",
+   .num_counters = 4,
 
-       .select0 = R_036108_GRBM_SE0_PERFCOUNTER_SELECT,
-       .counter0_lo = R_034114_GRBM_SE0_PERFCOUNTER_LO,
+   .select0 = R_036108_GRBM_SE0_PERFCOUNTER_SELECT,
+   .counter0_lo = R_034114_GRBM_SE0_PERFCOUNTER_LO,
 };
 
 static struct si_pc_block_base cik_IA = {
-       .name = "IA",
-       .num_counters = 4,
+   .name = "IA",
+   .num_counters = 4,
 
-       .select0 = R_036210_IA_PERFCOUNTER0_SELECT,
-       .counter0_lo = R_034220_IA_PERFCOUNTER0_LO,
-       .num_multi = 1,
-       .layout = SI_PC_MULTI_TAIL,
+   .select0 = R_036210_IA_PERFCOUNTER0_SELECT,
+   .counter0_lo = R_034220_IA_PERFCOUNTER0_LO,
+   .num_multi = 1,
+   .layout = SI_PC_MULTI_TAIL,
 };
 
 static struct si_pc_block_base cik_PA_SC = {
-       .name = "PA_SC",
-       .num_counters = 8,
-       .flags = SI_PC_BLOCK_SE,
-
-       .select0 = R_036500_PA_SC_PERFCOUNTER0_SELECT,
-       .counter0_lo = R_034500_PA_SC_PERFCOUNTER0_LO,
-       .num_multi = 1,
-       .layout = SI_PC_MULTI_ALTERNATE,
+   .name = "PA_SC",
+   .num_counters = 8,
+   .flags = SI_PC_BLOCK_SE,
+
+   .select0 = R_036500_PA_SC_PERFCOUNTER0_SELECT,
+   .counter0_lo = R_034500_PA_SC_PERFCOUNTER0_LO,
+   .num_multi = 1,
+   .layout = SI_PC_MULTI_ALTERNATE,
 };
 
 /* According to docs, PA_SU counters are only 48 bits wide. */
 static struct si_pc_block_base cik_PA_SU = {
-       .name = "PA_SU",
-       .num_counters = 4,
-       .flags = SI_PC_BLOCK_SE,
-
-       .select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT,
-       .counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO,
-       .num_multi = 2,
-       .layout = SI_PC_MULTI_ALTERNATE,
+   .name = "PA_SU",
+   .num_counters = 4,
+   .flags = SI_PC_BLOCK_SE,
+
+   .select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT,
+   .counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO,
+   .num_multi = 2,
+   .layout = SI_PC_MULTI_ALTERNATE,
 };
 
 static struct si_pc_block_base cik_SPI = {
-       .name = "SPI",
-       .num_counters = 6,
-       .flags = SI_PC_BLOCK_SE,
-
-       .select0 = R_036600_SPI_PERFCOUNTER0_SELECT,
-       .counter0_lo = R_034604_SPI_PERFCOUNTER0_LO,
-       .num_multi = 4,
-       .layout = SI_PC_MULTI_BLOCK,
+   .name = "SPI",
+   .num_counters = 6,
+   .flags = SI_PC_BLOCK_SE,
+
+   .select0 = R_036600_SPI_PERFCOUNTER0_SELECT,
+   .counter0_lo = R_034604_SPI_PERFCOUNTER0_LO,
+   .num_multi = 4,
+   .layout = SI_PC_MULTI_BLOCK,
 };
 
 static struct si_pc_block_base cik_SQ = {
-       .name = "SQ",
-       .num_counters = 16,
-       .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER,
-
-       .select0 = R_036700_SQ_PERFCOUNTER0_SELECT,
-       .select_or = S_036700_SQC_BANK_MASK(15) |
-                       S_036700_SQC_CLIENT_MASK(15) |
-                       S_036700_SIMD_MASK(15),
-       .counter0_lo = R_034700_SQ_PERFCOUNTER0_LO,
+   .name = "SQ",
+   .num_counters = 16,
+   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER,
+
+   .select0 = R_036700_SQ_PERFCOUNTER0_SELECT,
+   .select_or = S_036700_SQC_BANK_MASK(15) | S_036700_SQC_CLIENT_MASK(15) | S_036700_SIMD_MASK(15),
+   .counter0_lo = R_034700_SQ_PERFCOUNTER0_LO,
 };
 
 static struct si_pc_block_base cik_SX = {
-       .name = "SX",
-       .num_counters = 4,
-       .flags = SI_PC_BLOCK_SE,
-
-       .select0 = R_036900_SX_PERFCOUNTER0_SELECT,
-       .counter0_lo = R_034900_SX_PERFCOUNTER0_LO,
-       .num_multi = 2,
-       .layout = SI_PC_MULTI_TAIL,
+   .name = "SX",
+   .num_counters = 4,
+   .flags = SI_PC_BLOCK_SE,
+
+   .select0 = R_036900_SX_PERFCOUNTER0_SELECT,
+   .counter0_lo = R_034900_SX_PERFCOUNTER0_LO,
+   .num_multi = 2,
+   .layout = SI_PC_MULTI_TAIL,
 };
 
 static struct si_pc_block_base cik_TA = {
-       .name = "TA",
-       .num_counters = 2,
-       .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,
-
-       .select0 = R_036B00_TA_PERFCOUNTER0_SELECT,
-       .counter0_lo = R_034B00_TA_PERFCOUNTER0_LO,
-       .num_multi = 1,
-       .layout = SI_PC_MULTI_ALTERNATE,
+   .name = "TA",
+   .num_counters = 2,
+   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,
+
+   .select0 = R_036B00_TA_PERFCOUNTER0_SELECT,
+   .counter0_lo = R_034B00_TA_PERFCOUNTER0_LO,
+   .num_multi = 1,
+   .layout = SI_PC_MULTI_ALTERNATE,
 };
 
 static struct si_pc_block_base cik_TD = {
-       .name = "TD",
-       .num_counters = 2,
-       .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,
-
-       .select0 = R_036C00_TD_PERFCOUNTER0_SELECT,
-       .counter0_lo = R_034C00_TD_PERFCOUNTER0_LO,
-       .num_multi = 1,
-       .layout = SI_PC_MULTI_ALTERNATE,
+   .name = "TD",
+   .num_counters = 2,
+   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,
+
+   .select0 = R_036C00_TD_PERFCOUNTER0_SELECT,
+   .counter0_lo = R_034C00_TD_PERFCOUNTER0_LO,
+   .num_multi = 1,
+   .layout = SI_PC_MULTI_ALTERNATE,
 };
 
 static struct si_pc_block_base cik_TCA = {
-       .name = "TCA",
-       .num_counters = 4,
-       .flags = SI_PC_BLOCK_INSTANCE_GROUPS,
-
-       .select0 = R_036E40_TCA_PERFCOUNTER0_SELECT,
-       .counter0_lo = R_034E40_TCA_PERFCOUNTER0_LO,
-       .num_multi = 2,
-       .layout = SI_PC_MULTI_ALTERNATE,
+   .name = "TCA",
+   .num_counters = 4,
+   .flags = SI_PC_BLOCK_INSTANCE_GROUPS,
+
+   .select0 = R_036E40_TCA_PERFCOUNTER0_SELECT,
+   .counter0_lo = R_034E40_TCA_PERFCOUNTER0_LO,
+   .num_multi = 2,
+   .layout = SI_PC_MULTI_ALTERNATE,
 };
 
 static struct si_pc_block_base cik_TCC = {
-       .name = "TCC",
-       .num_counters = 4,
-       .flags = SI_PC_BLOCK_INSTANCE_GROUPS,
-
-       .select0 = R_036E00_TCC_PERFCOUNTER0_SELECT,
-       .counter0_lo = R_034E00_TCC_PERFCOUNTER0_LO,
-       .num_multi = 2,
-       .layout = SI_PC_MULTI_ALTERNATE,
+   .name = "TCC",
+   .num_counters = 4,
+   .flags = SI_PC_BLOCK_INSTANCE_GROUPS,
+
+   .select0 = R_036E00_TCC_PERFCOUNTER0_SELECT,
+   .counter0_lo = R_034E00_TCC_PERFCOUNTER0_LO,
+   .num_multi = 2,
+   .layout = SI_PC_MULTI_ALTERNATE,
 };
 
 static struct si_pc_block_base cik_TCP = {
-       .name = "TCP",
-       .num_counters = 4,
-       .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,
-
-       .select0 = R_036D00_TCP_PERFCOUNTER0_SELECT,
-       .counter0_lo = R_034D00_TCP_PERFCOUNTER0_LO,
-       .num_multi = 2,
-       .layout = SI_PC_MULTI_ALTERNATE,
+   .name = "TCP",
+   .num_counters = 4,
+   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,
+
+   .select0 = R_036D00_TCP_PERFCOUNTER0_SELECT,
+   .counter0_lo = R_034D00_TCP_PERFCOUNTER0_LO,
+   .num_multi = 2,
+   .layout = SI_PC_MULTI_ALTERNATE,
 };
 
 static struct si_pc_block_base cik_VGT = {
-       .name = "VGT",
-       .num_counters = 4,
-       .flags = SI_PC_BLOCK_SE,
-
-       .select0 = R_036230_VGT_PERFCOUNTER0_SELECT,
-       .counter0_lo = R_034240_VGT_PERFCOUNTER0_LO,
-       .num_multi = 1,
-       .layout = SI_PC_MULTI_TAIL,
+   .name = "VGT",
+   .num_counters = 4,
+   .flags = SI_PC_BLOCK_SE,
+
+   .select0 = R_036230_VGT_PERFCOUNTER0_SELECT,
+   .counter0_lo = R_034240_VGT_PERFCOUNTER0_LO,
+   .num_multi = 1,
+   .layout = SI_PC_MULTI_TAIL,
 };
 
 static struct si_pc_block_base cik_WD = {
-       .name = "WD",
-       .num_counters = 4,
+   .name = "WD",
+   .num_counters = 4,
 
-       .select0 = R_036200_WD_PERFCOUNTER0_SELECT,
-       .counter0_lo = R_034200_WD_PERFCOUNTER0_LO,
+   .select0 = R_036200_WD_PERFCOUNTER0_SELECT,
+   .counter0_lo = R_034200_WD_PERFCOUNTER0_LO,
 };
 
 static struct si_pc_block_base cik_MC = {
-       .name = "MC",
-       .num_counters = 4,
+   .name = "MC",
+   .num_counters = 4,
 
-       .layout = SI_PC_FAKE,
+   .layout = SI_PC_FAKE,
 };
 
 static struct si_pc_block_base cik_SRBM = {
-       .name = "SRBM",
-       .num_counters = 2,
+   .name = "SRBM",
+   .num_counters = 2,
 
-       .layout = SI_PC_FAKE,
+   .layout = SI_PC_FAKE,
 };
 
 /* Both the number of instances and selectors varies between chips of the same
@@ -411,947 +408,868 @@ static struct si_pc_block_base cik_SRBM = {
  * blocks here matters.
  */
 static struct si_pc_block_gfxdescr groups_CIK[] = {
-       { &cik_CB, 226},
-       { &cik_CPF, 17 },
-       { &cik_DB, 257},
-       { &cik_GRBM, 34 },
-       { &cik_GRBMSE, 15 },
-       { &cik_PA_SU, 153 },
-       { &cik_PA_SC, 395 },
-       { &cik_SPI, 186 },
-       { &cik_SQ, 252 },
-       { &cik_SX, 32 },
-       { &cik_TA, 111, 11 },
-       { &cik_TCA, 39, 2 },
-       { &cik_TCC, 160},
-       { &cik_TD, 55, 11 },
-       { &cik_TCP, 154, 11 },
-       { &cik_GDS, 121 },
-       { &cik_VGT, 140 },
-       { &cik_IA, 22 },
-       { &cik_MC, 22 },
-       { &cik_SRBM, 19 },
-       { &cik_WD, 22 },
-       { &cik_CPG, 46 },
-       { &cik_CPC, 22 },
+   {&cik_CB, 226},     {&cik_CPF, 17},    {&cik_DB, 257},  {&cik_GRBM, 34},   {&cik_GRBMSE, 15},
+   {&cik_PA_SU, 153},  {&cik_PA_SC, 395}, {&cik_SPI, 186}, {&cik_SQ, 252},    {&cik_SX, 32},
+   {&cik_TA, 111, 11}, {&cik_TCA, 39, 2}, {&cik_TCC, 160}, {&cik_TD, 55, 11}, {&cik_TCP, 154, 11},
+   {&cik_GDS, 121},    {&cik_VGT, 140},   {&cik_IA, 22},   {&cik_MC, 22},     {&cik_SRBM, 19},
+   {&cik_WD, 22},      {&cik_CPG, 46},    {&cik_CPC, 22},
 
 };
 
 static struct si_pc_block_gfxdescr groups_VI[] = {
-       { &cik_CB, 405},
-       { &cik_CPF, 19 },
-       { &cik_DB, 257},
-       { &cik_GRBM, 34 },
-       { &cik_GRBMSE, 15 },
-       { &cik_PA_SU, 154 },
-       { &cik_PA_SC, 397 },
-       { &cik_SPI, 197 },
-       { &cik_SQ, 273 },
-       { &cik_SX, 34 },
-       { &cik_TA, 119, 16 },
-       { &cik_TCA, 35, 2 },
-       { &cik_TCC, 192},
-       { &cik_TD, 55, 16 },
-       { &cik_TCP, 180, 16 },
-       { &cik_GDS, 121 },
-       { &cik_VGT, 147 },
-       { &cik_IA, 24 },
-       { &cik_MC, 22 },
-       { &cik_SRBM, 27 },
-       { &cik_WD, 37 },
-       { &cik_CPG, 48 },
-       { &cik_CPC, 24 },
+   {&cik_CB, 405},     {&cik_CPF, 19},    {&cik_DB, 257},  {&cik_GRBM, 34},   {&cik_GRBMSE, 15},
+   {&cik_PA_SU, 154},  {&cik_PA_SC, 397}, {&cik_SPI, 197}, {&cik_SQ, 273},    {&cik_SX, 34},
+   {&cik_TA, 119, 16}, {&cik_TCA, 35, 2}, {&cik_TCC, 192}, {&cik_TD, 55, 16}, {&cik_TCP, 180, 16},
+   {&cik_GDS, 121},    {&cik_VGT, 147},   {&cik_IA, 24},   {&cik_MC, 22},     {&cik_SRBM, 27},
+   {&cik_WD, 37},      {&cik_CPG, 48},    {&cik_CPC, 24},
 
 };
 
 static struct si_pc_block_gfxdescr groups_gfx9[] = {
-       { &cik_CB, 438},
-       { &cik_CPF, 32 },
-       { &cik_DB, 328},
-       { &cik_GRBM, 38 },
-       { &cik_GRBMSE, 16 },
-       { &cik_PA_SU, 292 },
-       { &cik_PA_SC, 491 },
-       { &cik_SPI, 196 },
-       { &cik_SQ, 374 },
-       { &cik_SX, 208 },
-       { &cik_TA, 119, 16 },
-       { &cik_TCA, 35, 2 },
-       { &cik_TCC, 256},
-       { &cik_TD, 57, 16 },
-       { &cik_TCP, 85, 16 },
-       { &cik_GDS, 121 },
-       { &cik_VGT, 148 },
-       { &cik_IA, 32 },
-       { &cik_WD, 58 },
-       { &cik_CPG, 59 },
-       { &cik_CPC, 35 },
+   {&cik_CB, 438},     {&cik_CPF, 32},    {&cik_DB, 328},  {&cik_GRBM, 38},   {&cik_GRBMSE, 16},
+   {&cik_PA_SU, 292},  {&cik_PA_SC, 491}, {&cik_SPI, 196}, {&cik_SQ, 374},    {&cik_SX, 208},
+   {&cik_TA, 119, 16}, {&cik_TCA, 35, 2}, {&cik_TCC, 256}, {&cik_TD, 57, 16}, {&cik_TCP, 85, 16},
+   {&cik_GDS, 121},    {&cik_VGT, 148},   {&cik_IA, 32},   {&cik_WD, 58},     {&cik_CPG, 59},
+   {&cik_CPC, 35},
 };
 
 static bool si_pc_block_has_per_se_groups(const struct si_perfcounters *pc,
-                                         const struct si_pc_block *block)
+                                          const struct si_pc_block *block)
 {
-       return block->b->b->flags & SI_PC_BLOCK_SE_GROUPS ||
-              (block->b->b->flags & SI_PC_BLOCK_SE && pc->separate_se);
+   return block->b->b->flags & SI_PC_BLOCK_SE_GROUPS ||
+          (block->b->b->flags & SI_PC_BLOCK_SE && pc->separate_se);
 }
 
 static bool si_pc_block_has_per_instance_groups(const struct si_perfcounters *pc,
-                                               const struct si_pc_block *block)
+                                                const struct si_pc_block *block)
 {
-       return block->b->b->flags & SI_PC_BLOCK_INSTANCE_GROUPS ||
-              (block->num_instances > 1 && pc->separate_instance);
+   return block->b->b->flags & SI_PC_BLOCK_INSTANCE_GROUPS ||
+          (block->num_instances > 1 && pc->separate_instance);
 }
 
-static struct si_pc_block *
-lookup_counter(struct si_perfcounters *pc, unsigned index,
-              unsigned *base_gid, unsigned *sub_index)
+static struct si_pc_block *lookup_counter(struct si_perfcounters *pc, unsigned index,
+                                          unsigned *base_gid, unsigned *sub_index)
 {
-       struct si_pc_block *block = pc->blocks;
-       unsigned bid;
+   struct si_pc_block *block = pc->blocks;
+   unsigned bid;
 
-       *base_gid = 0;
-       for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
-               unsigned total = block->num_groups * block->b->selectors;
+   *base_gid = 0;
+   for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
+      unsigned total = block->num_groups * block->b->selectors;
 
-               if (index < total) {
-                       *sub_index = index;
-                       return block;
-               }
+      if (index < total) {
+         *sub_index = index;
+         return block;
+      }
 
-               index -= total;
-               *base_gid += block->num_groups;
-       }
+      index -= total;
+      *base_gid += block->num_groups;
+   }
 
-       return NULL;
+   return NULL;
 }
 
-static struct si_pc_block *
-lookup_group(struct si_perfcounters *pc, unsigned *index)
+static struct si_pc_block *lookup_group(struct si_perfcounters *pc, unsigned *index)
 {
-       unsigned bid;
-       struct si_pc_block *block = pc->blocks;
+   unsigned bid;
+   struct si_pc_block *block = pc->blocks;
 
-       for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
-               if (*index < block->num_groups)
-                       return block;
-               *index -= block->num_groups;
-       }
+   for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
+      if (*index < block->num_groups)
+         return block;
+      *index -= block->num_groups;
+   }
 
-       return NULL;
+   return NULL;
 }
 
-static void si_pc_emit_instance(struct si_context *sctx,
-                               int se, int instance)
+static void si_pc_emit_instance(struct si_context *sctx, int se, int instance)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       unsigned value = S_030800_SH_BROADCAST_WRITES(1);
-
-       if (se >= 0) {
-               value |= S_030800_SE_INDEX(se);
-       } else {
-               value |= S_030800_SE_BROADCAST_WRITES(1);
-       }
-
-       if (instance >= 0) {
-               value |= S_030800_INSTANCE_INDEX(instance);
-       } else {
-               value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
-       }
-
-       radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value);
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   unsigned value = S_030800_SH_BROADCAST_WRITES(1);
+
+   if (se >= 0) {
+      value |= S_030800_SE_INDEX(se);
+   } else {
+      value |= S_030800_SE_BROADCAST_WRITES(1);
+   }
+
+   if (instance >= 0) {
+      value |= S_030800_INSTANCE_INDEX(instance);
+   } else {
+      value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
+   }
+
+   radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value);
 }
 
-static void si_pc_emit_shaders(struct si_context *sctx,
-                              unsigned shaders)
+static void si_pc_emit_shaders(struct si_context *sctx, unsigned shaders)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
-       radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2);
-       radeon_emit(cs, shaders & 0x7f);
-       radeon_emit(cs, 0xffffffff);
+   radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2);
+   radeon_emit(cs, shaders & 0x7f);
+   radeon_emit(cs, 0xffffffff);
 }
 
-static void si_pc_emit_select(struct si_context *sctx,
-                       struct si_pc_block *block,
-                       unsigned count, unsigned *selectors)
+static void si_pc_emit_select(struct si_context *sctx, struct si_pc_block *block, unsigned count,
+                              unsigned *selectors)
 {
-       struct si_pc_block_base *regs = block->b->b;
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       unsigned idx;
-       unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK;
-       unsigned dw;
-
-       assert(count <= regs->num_counters);
-
-       if (regs->layout & SI_PC_FAKE)
-               return;
-
-       if (layout_multi == SI_PC_MULTI_BLOCK) {
-               assert(!(regs->layout & SI_PC_REG_REVERSE));
-
-               dw = count + regs->num_prelude;
-               if (count >= regs->num_multi)
-                       dw += regs->num_multi;
-               radeon_set_uconfig_reg_seq(cs, regs->select0, dw);
-               for (idx = 0; idx < regs->num_prelude; ++idx)
-                       radeon_emit(cs, 0);
-               for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
-                       radeon_emit(cs, selectors[idx] | regs->select_or);
-
-               if (count < regs->num_multi) {
-                       unsigned select1 =
-                               regs->select0 + 4 * regs->num_multi;
-                       radeon_set_uconfig_reg_seq(cs, select1, count);
-               }
-
-               for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
-                       radeon_emit(cs, 0);
-
-               if (count > regs->num_multi) {
-                       for (idx = regs->num_multi; idx < count; ++idx)
-                               radeon_emit(cs, selectors[idx] | regs->select_or);
-               }
-       } else if (layout_multi == SI_PC_MULTI_TAIL) {
-               unsigned select1, select1_count;
-
-               assert(!(regs->layout & SI_PC_REG_REVERSE));
-
-               radeon_set_uconfig_reg_seq(cs, regs->select0, count + regs->num_prelude);
-               for (idx = 0; idx < regs->num_prelude; ++idx)
-                       radeon_emit(cs, 0);
-               for (idx = 0; idx < count; ++idx)
-                       radeon_emit(cs, selectors[idx] | regs->select_or);
-
-               select1 = regs->select0 + 4 * regs->num_counters;
-               select1_count = MIN2(count, regs->num_multi);
-               radeon_set_uconfig_reg_seq(cs, select1, select1_count);
-               for (idx = 0; idx < select1_count; ++idx)
-                       radeon_emit(cs, 0);
-       } else if (layout_multi == SI_PC_MULTI_CUSTOM) {
-               unsigned *reg = regs->select;
-               for (idx = 0; idx < count; ++idx) {
-                       radeon_set_uconfig_reg(cs, *reg++, selectors[idx] | regs->select_or);
-                       if (idx < regs->num_multi)
-                               radeon_set_uconfig_reg(cs, *reg++, 0);
-               }
-       } else {
-               assert(layout_multi == SI_PC_MULTI_ALTERNATE);
-
-               unsigned reg_base = regs->select0;
-               unsigned reg_count = count + MIN2(count, regs->num_multi);
-               reg_count += regs->num_prelude;
-
-               if (!(regs->layout & SI_PC_REG_REVERSE)) {
-                       radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);
-
-                       for (idx = 0; idx < regs->num_prelude; ++idx)
-                               radeon_emit(cs, 0);
-                       for (idx = 0; idx < count; ++idx) {
-                               radeon_emit(cs, selectors[idx] | regs->select_or);
-                               if (idx < regs->num_multi)
-                                       radeon_emit(cs, 0);
-                       }
-               } else {
-                       reg_base -= (reg_count - 1) * 4;
-                       radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);
-
-                       for (idx = count; idx > 0; --idx) {
-                               if (idx <= regs->num_multi)
-                                       radeon_emit(cs, 0);
-                               radeon_emit(cs, selectors[idx - 1] | regs->select_or);
-                       }
-                       for (idx = 0; idx < regs->num_prelude; ++idx)
-                               radeon_emit(cs, 0);
-               }
-       }
+   struct si_pc_block_base *regs = block->b->b;
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   unsigned idx;
+   unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK;
+   unsigned dw;
+
+   assert(count <= regs->num_counters);
+
+   if (regs->layout & SI_PC_FAKE)
+      return;
+
+   if (layout_multi == SI_PC_MULTI_BLOCK) {
+      assert(!(regs->layout & SI_PC_REG_REVERSE));
+
+      dw = count + regs->num_prelude;
+      if (count >= regs->num_multi)
+         dw += regs->num_multi;
+      radeon_set_uconfig_reg_seq(cs, regs->select0, dw);
+      for (idx = 0; idx < regs->num_prelude; ++idx)
+         radeon_emit(cs, 0);
+      for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
+         radeon_emit(cs, selectors[idx] | regs->select_or);
+
+      if (count < regs->num_multi) {
+         unsigned select1 = regs->select0 + 4 * regs->num_multi;
+         radeon_set_uconfig_reg_seq(cs, select1, count);
+      }
+
+      for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
+         radeon_emit(cs, 0);
+
+      if (count > regs->num_multi) {
+         for (idx = regs->num_multi; idx < count; ++idx)
+            radeon_emit(cs, selectors[idx] | regs->select_or);
+      }
+   } else if (layout_multi == SI_PC_MULTI_TAIL) {
+      unsigned select1, select1_count;
+
+      assert(!(regs->layout & SI_PC_REG_REVERSE));
+
+      radeon_set_uconfig_reg_seq(cs, regs->select0, count + regs->num_prelude);
+      for (idx = 0; idx < regs->num_prelude; ++idx)
+         radeon_emit(cs, 0);
+      for (idx = 0; idx < count; ++idx)
+         radeon_emit(cs, selectors[idx] | regs->select_or);
+
+      select1 = regs->select0 + 4 * regs->num_counters;
+      select1_count = MIN2(count, regs->num_multi);
+      radeon_set_uconfig_reg_seq(cs, select1, select1_count);
+      for (idx = 0; idx < select1_count; ++idx)
+         radeon_emit(cs, 0);
+   } else if (layout_multi == SI_PC_MULTI_CUSTOM) {
+      unsigned *reg = regs->select;
+      for (idx = 0; idx < count; ++idx) {
+         radeon_set_uconfig_reg(cs, *reg++, selectors[idx] | regs->select_or);
+         if (idx < regs->num_multi)
+            radeon_set_uconfig_reg(cs, *reg++, 0);
+      }
+   } else {
+      assert(layout_multi == SI_PC_MULTI_ALTERNATE);
+
+      unsigned reg_base = regs->select0;
+      unsigned reg_count = count + MIN2(count, regs->num_multi);
+      reg_count += regs->num_prelude;
+
+      if (!(regs->layout & SI_PC_REG_REVERSE)) {
+         radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);
+
+         for (idx = 0; idx < regs->num_prelude; ++idx)
+            radeon_emit(cs, 0);
+         for (idx = 0; idx < count; ++idx) {
+            radeon_emit(cs, selectors[idx] | regs->select_or);
+            if (idx < regs->num_multi)
+               radeon_emit(cs, 0);
+         }
+      } else {
+         reg_base -= (reg_count - 1) * 4;
+         radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);
+
+         for (idx = count; idx > 0; --idx) {
+            if (idx <= regs->num_multi)
+               radeon_emit(cs, 0);
+            radeon_emit(cs, selectors[idx - 1] | regs->select_or);
+         }
+         for (idx = 0; idx < regs->num_prelude; ++idx)
+            radeon_emit(cs, 0);
+      }
+   }
 }
 
-static void si_pc_emit_start(struct si_context *sctx,
-                            struct si_resource *buffer, uint64_t va)
+static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-
-       si_cp_copy_data(sctx, sctx->gfx_cs,
-                       COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address,
-                       COPY_DATA_IMM, NULL, 1);
-
-       radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
-                              S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET));
-       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-       radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
-       radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
-                              S_036020_PERFMON_STATE(V_036020_START_COUNTING));
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+   si_cp_copy_data(sctx, sctx->gfx_cs, COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address,
+                   COPY_DATA_IMM, NULL, 1);
+
+   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
+                          S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET));
+   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+   radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
+   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
+                          S_036020_PERFMON_STATE(V_036020_START_COUNTING));
 }
 
 /* Note: The buffer was already added in si_pc_emit_start, so we don't have to
  * do it again in here. */
-static void si_pc_emit_stop(struct si_context *sctx,
-                           struct si_resource *buffer, uint64_t va)
+static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-
-       si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0,
-                         EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
-                         EOP_DATA_SEL_VALUE_32BIT,
-                         buffer, va, 0, SI_NOT_QUERY);
-       si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL);
-
-       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-       radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
-       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-       radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
-       radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
-                              S_036020_PERFMON_STATE(V_036020_STOP_COUNTING) |
-                              S_036020_PERFMON_SAMPLE_ENABLE(1));
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+   si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
+                     EOP_DATA_SEL_VALUE_32BIT, buffer, va, 0, SI_NOT_QUERY);
+   si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL);
+
+   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+   radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
+   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+   radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
+   radeon_set_uconfig_reg(
+      cs, R_036020_CP_PERFMON_CNTL,
+      S_036020_PERFMON_STATE(V_036020_STOP_COUNTING) | S_036020_PERFMON_SAMPLE_ENABLE(1));
 }
 
-static void si_pc_emit_read(struct si_context *sctx,
-                           struct si_pc_block *block,
-                           unsigned count, uint64_t va)
+static void si_pc_emit_read(struct si_context *sctx, struct si_pc_block *block, unsigned count,
+                            uint64_t va)
 {
-       struct si_pc_block_base *regs = block->b->b;
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       unsigned idx;
-       unsigned reg = regs->counter0_lo;
-       unsigned reg_delta = 8;
-
-       if (!(regs->layout & SI_PC_FAKE)) {
-               if (regs->layout & SI_PC_REG_REVERSE)
-                       reg_delta = -reg_delta;
-
-               for (idx = 0; idx < count; ++idx) {
-                       if (regs->counters)
-                               reg = regs->counters[idx];
-
-                       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
-                       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
-                                       COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
-                                       COPY_DATA_COUNT_SEL); /* 64 bits */
-                       radeon_emit(cs, reg >> 2);
-                       radeon_emit(cs, 0); /* unused */
-                       radeon_emit(cs, va);
-                       radeon_emit(cs, va >> 32);
-                       va += sizeof(uint64_t);
-                       reg += reg_delta;
-               }
-       } else {
-               for (idx = 0; idx < count; ++idx) {
-                       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
-                       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
-                                       COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
-                                       COPY_DATA_COUNT_SEL);
-                       radeon_emit(cs, 0); /* immediate */
-                       radeon_emit(cs, 0);
-                       radeon_emit(cs, va);
-                       radeon_emit(cs, va >> 32);
-                       va += sizeof(uint64_t);
-               }
-       }
+   struct si_pc_block_base *regs = block->b->b;
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   unsigned idx;
+   unsigned reg = regs->counter0_lo;
+   unsigned reg_delta = 8;
+
+   if (!(regs->layout & SI_PC_FAKE)) {
+      if (regs->layout & SI_PC_REG_REVERSE)
+         reg_delta = -reg_delta;
+
+      for (idx = 0; idx < count; ++idx) {
+         if (regs->counters)
+            reg = regs->counters[idx];
+
+         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+         radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
+                            COPY_DATA_COUNT_SEL); /* 64 bits */
+         radeon_emit(cs, reg >> 2);
+         radeon_emit(cs, 0); /* unused */
+         radeon_emit(cs, va);
+         radeon_emit(cs, va >> 32);
+         va += sizeof(uint64_t);
+         reg += reg_delta;
+      }
+   } else {
+      for (idx = 0; idx < count; ++idx) {
+         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+         radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
+                            COPY_DATA_COUNT_SEL);
+         radeon_emit(cs, 0); /* immediate */
+         radeon_emit(cs, 0);
+         radeon_emit(cs, va);
+         radeon_emit(cs, va >> 32);
+         va += sizeof(uint64_t);
+      }
+   }
 }
 
-static void si_pc_query_destroy(struct si_context *sctx,
-                               struct si_query *squery)
+static void si_pc_query_destroy(struct si_context *sctx, struct si_query *squery)
 {
-       struct si_query_pc *query = (struct si_query_pc *)squery;
+   struct si_query_pc *query = (struct si_query_pc *)squery;
 
-       while (query->groups) {
-               struct si_query_group *group = query->groups;
-               query->groups = group->next;
-               FREE(group);
-       }
+   while (query->groups) {
+      struct si_query_group *group = query->groups;
+      query->groups = group->next;
+      FREE(group);
+   }
 
-       FREE(query->counters);
+   FREE(query->counters);
 
-       si_query_buffer_destroy(sctx->screen, &query->buffer);
-       FREE(query);
+   si_query_buffer_destroy(sctx->screen, &query->buffer);
+   FREE(query);
 }
 
 static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery)
 /*
-                                  struct si_query_hw *hwquery,
-                                  struct si_resource *buffer, uint64_t va)*/
+                                   struct si_query_hw *hwquery,
+                                   struct si_resource *buffer, uint64_t va)*/
 {
-       struct si_query_pc *query = (struct si_query_pc *)squery;
-       int current_se = -1;
-       int current_instance = -1;
+   struct si_query_pc *query = (struct si_query_pc *)squery;
+   int current_se = -1;
+   int current_instance = -1;
 
-       if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size))
-               return;
-       si_need_gfx_cs_space(sctx);
+   if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size))
+      return;
+   si_need_gfx_cs_space(sctx);
 
-       if (query->shaders)
-               si_pc_emit_shaders(sctx, query->shaders);
+   if (query->shaders)
+      si_pc_emit_shaders(sctx, query->shaders);
 
-       for (struct si_query_group *group = query->groups; group; group = group->next) {
-               struct si_pc_block *block = group->block;
+   for (struct si_query_group *group = query->groups; group; group = group->next) {
+      struct si_pc_block *block = group->block;
 
-               if (group->se != current_se || group->instance != current_instance) {
-                       current_se = group->se;
-                       current_instance = group->instance;
-                       si_pc_emit_instance(sctx, group->se, group->instance);
-               }
+      if (group->se != current_se || group->instance != current_instance) {
+         current_se = group->se;
+         current_instance = group->instance;
+         si_pc_emit_instance(sctx, group->se, group->instance);
+      }
 
-               si_pc_emit_select(sctx, block, group->num_counters, group->selectors);
-       }
+      si_pc_emit_select(sctx, block, group->num_counters, group->selectors);
+   }
 
-       if (current_se != -1 || current_instance != -1)
-               si_pc_emit_instance(sctx, -1, -1);
+   if (current_se != -1 || current_instance != -1)
+      si_pc_emit_instance(sctx, -1, -1);
 
-       uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
-       si_pc_emit_start(sctx, query->buffer.buf, va);
+   uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
+   si_pc_emit_start(sctx, query->buffer.buf, va);
 }
 
 static void si_pc_query_suspend(struct si_context *sctx, struct si_query *squery)
 {
-       struct si_query_pc *query = (struct si_query_pc *)squery;
+   struct si_query_pc *query = (struct si_query_pc *)squery;
 
-       if (!query->buffer.buf)
-               return;
+   if (!query->buffer.buf)
+      return;
 
-       uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
-       query->buffer.results_end += query->result_size;
+   uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
+   query->buffer.results_end += query->result_size;
 
-       si_pc_emit_stop(sctx, query->buffer.buf, va);
+   si_pc_emit_stop(sctx, query->buffer.buf, va);
 
-       for (struct si_query_group *group = query->groups; group; group = group->next) {
-               struct si_pc_block *block = group->block;
-               unsigned se = group->se >= 0 ? group->se : 0;
-               unsigned se_end = se + 1;
+   for (struct si_query_group *group = query->groups; group; group = group->next) {
+      struct si_pc_block *block = group->block;
+      unsigned se = group->se >= 0 ? group->se : 0;
+      unsigned se_end = se + 1;
 
-               if ((block->b->b->flags & SI_PC_BLOCK_SE) && (group->se < 0))
-                       se_end = sctx->screen->info.max_se;
+      if ((block->b->b->flags & SI_PC_BLOCK_SE) && (group->se < 0))
+         se_end = sctx->screen->info.max_se;
 
-               do {
-                       unsigned instance = group->instance >= 0 ? group->instance : 0;
+      do {
+         unsigned instance = group->instance >= 0 ? group->instance : 0;
 
-                       do {
-                               si_pc_emit_instance(sctx, se, instance);
-                               si_pc_emit_read(sctx, block, group->num_counters, va);
-                               va += sizeof(uint64_t) * group->num_counters;
-                       } while (group->instance < 0 && ++instance < block->num_instances);
-               } while (++se < se_end);
-       }
+         do {
+            si_pc_emit_instance(sctx, se, instance);
+            si_pc_emit_read(sctx, block, group->num_counters, va);
+            va += sizeof(uint64_t) * group->num_counters;
+         } while (group->instance < 0 && ++instance < block->num_instances);
+      } while (++se < se_end);
+   }
 
-       si_pc_emit_instance(sctx, -1, -1);
+   si_pc_emit_instance(sctx, -1, -1);
 }
 
 static bool si_pc_query_begin(struct si_context *ctx, struct si_query *squery)
 {
-       struct si_query_pc *query = (struct si_query_pc *)squery;
+   struct si_query_pc *query = (struct si_query_pc *)squery;
 
-       si_query_buffer_reset(ctx, &query->buffer);
+   si_query_buffer_reset(ctx, &query->buffer);
 
-       list_addtail(&query->b.active_list, &ctx->active_queries);
-       ctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
+   list_addtail(&query->b.active_list, &ctx->active_queries);
+   ctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
 
-       si_pc_query_resume(ctx, squery);
+   si_pc_query_resume(ctx, squery);
 
-       return true;
+   return true;
 }
 
 static bool si_pc_query_end(struct si_context *ctx, struct si_query *squery)
 {
-       struct si_query_pc *query = (struct si_query_pc *)squery;
+   struct si_query_pc *query = (struct si_query_pc *)squery;
 
-       si_pc_query_suspend(ctx, squery);
+   si_pc_query_suspend(ctx, squery);
 
-       list_del(&squery->active_list);
-       ctx->num_cs_dw_queries_suspend -= squery->num_cs_dw_suspend;
+   list_del(&squery->active_list);
+   ctx->num_cs_dw_queries_suspend -= squery->num_cs_dw_suspend;
 
-       return query->buffer.buf != NULL;
+   return query->buffer.buf != NULL;
 }
 
-static void si_pc_query_add_result(struct si_query_pc *query,
-                                  void *buffer,
-                                  union pipe_query_result *result)
+static void si_pc_query_add_result(struct si_query_pc *query, void *buffer,
+                                   union pipe_query_result *result)
 {
-       uint64_t *results = buffer;
-       unsigned i, j;
+   uint64_t *results = buffer;
+   unsigned i, j;
 
-       for (i = 0; i < query->num_counters; ++i) {
-               struct si_query_counter *counter = &query->counters[i];
+   for (i = 0; i < query->num_counters; ++i) {
+      struct si_query_counter *counter = &query->counters[i];
 
-               for (j = 0; j < counter->qwords; ++j) {
-                       uint32_t value = results[counter->base + j * counter->stride];
-                       result->batch[i].u64 += value;
-               }
-       }
+      for (j = 0; j < counter->qwords; ++j) {
+         uint32_t value = results[counter->base + j * counter->stride];
+         result->batch[i].u64 += value;
+      }
+   }
 }
 
-static bool si_pc_query_get_result(struct si_context *sctx, struct si_query *squery,
-                                  bool wait, union pipe_query_result *result)
+static bool si_pc_query_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
+                                   union pipe_query_result *result)
 {
-       struct si_query_pc *query = (struct si_query_pc *)squery;
+   struct si_query_pc *query = (struct si_query_pc *)squery;
 
-       memset(result, 0, sizeof(result->batch[0]) * query->num_counters);
+   memset(result, 0, sizeof(result->batch[0]) * query->num_counters);
 
-       for (struct si_query_buffer *qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
-               unsigned usage = PIPE_TRANSFER_READ |
-                                (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
-               unsigned results_base = 0;
-               void *map;
+   for (struct si_query_buffer *qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
+      unsigned usage = PIPE_TRANSFER_READ | (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
+      unsigned results_base = 0;
+      void *map;
 
-               if (squery->b.flushed)
-                       map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
-               else
-                       map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
+      if (squery->b.flushed)
+         map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
+      else
+         map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
 
-               if (!map)
-                       return false;
+      if (!map)
+         return false;
 
-               while (results_base != qbuf->results_end) {
-                       si_pc_query_add_result(query, map + results_base, result);
-                       results_base += query->result_size;
-               }
-       }
+      while (results_base != qbuf->results_end) {
+         si_pc_query_add_result(query, map + results_base, result);
+         results_base += query->result_size;
+      }
+   }
 
-       return true;
+   return true;
 }
 
 static const struct si_query_ops batch_query_ops = {
-       .destroy = si_pc_query_destroy,
-       .begin = si_pc_query_begin,
-       .end = si_pc_query_end,
-       .get_result = si_pc_query_get_result,
+   .destroy = si_pc_query_destroy,
+   .begin = si_pc_query_begin,
+   .end = si_pc_query_end,
+   .get_result = si_pc_query_get_result,
 
-       .suspend = si_pc_query_suspend,
-       .resume = si_pc_query_resume,
+   .suspend = si_pc_query_suspend,
+   .resume = si_pc_query_resume,
 };
 
-static struct si_query_group *get_group_state(struct si_screen *screen,
-                                             struct si_query_pc *query,
-                                             struct si_pc_block *block,
-                                             unsigned sub_gid)
+static struct si_query_group *get_group_state(struct si_screen *screen, struct si_query_pc *query,
+                                              struct si_pc_block *block, unsigned sub_gid)
 {
-       struct si_query_group *group = query->groups;
-
-       while (group) {
-               if (group->block == block && group->sub_gid == sub_gid)
-                       return group;
-               group = group->next;
-       }
-
-       group = CALLOC_STRUCT(si_query_group);
-       if (!group)
-               return NULL;
-
-       group->block = block;
-       group->sub_gid = sub_gid;
-
-       if (block->b->b->flags & SI_PC_BLOCK_SHADER) {
-               unsigned sub_gids = block->num_instances;
-               unsigned shader_id;
-               unsigned shaders;
-               unsigned query_shaders;
-
-               if (si_pc_block_has_per_se_groups(screen->perfcounters, block))
-                       sub_gids = sub_gids * screen->info.max_se;
-               shader_id = sub_gid / sub_gids;
-               sub_gid = sub_gid % sub_gids;
-
-               shaders = si_pc_shader_type_bits[shader_id];
-
-               query_shaders = query->shaders & ~SI_PC_SHADERS_WINDOWING;
-               if (query_shaders && query_shaders != shaders) {
-                       fprintf(stderr, "si_perfcounter: incompatible shader groups\n");
-                       FREE(group);
-                       return NULL;
-               }
-               query->shaders = shaders;
-       }
-
-       if (block->b->b->flags & SI_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
-               // A non-zero value in query->shaders ensures that the shader
-               // masking is reset unless the user explicitly requests one.
-               query->shaders = SI_PC_SHADERS_WINDOWING;
-       }
-
-       if (si_pc_block_has_per_se_groups(screen->perfcounters, block)) {
-               group->se = sub_gid / block->num_instances;
-               sub_gid = sub_gid % block->num_instances;
-       } else {
-               group->se = -1;
-       }
-
-       if (si_pc_block_has_per_instance_groups(screen->perfcounters, block)) {
-               group->instance = sub_gid;
-       } else {
-               group->instance = -1;
-       }
-
-       group->next = query->groups;
-       query->groups = group;
-
-       return group;
+   struct si_query_group *group = query->groups;
+
+   while (group) {
+      if (group->block == block && group->sub_gid == sub_gid)
+         return group;
+      group = group->next;
+   }
+
+   group = CALLOC_STRUCT(si_query_group);
+   if (!group)
+      return NULL;
+
+   group->block = block;
+   group->sub_gid = sub_gid;
+
+   if (block->b->b->flags & SI_PC_BLOCK_SHADER) {
+      unsigned sub_gids = block->num_instances;
+      unsigned shader_id;
+      unsigned shaders;
+      unsigned query_shaders;
+
+      if (si_pc_block_has_per_se_groups(screen->perfcounters, block))
+         sub_gids = sub_gids * screen->info.max_se;
+      shader_id = sub_gid / sub_gids;
+      sub_gid = sub_gid % sub_gids;
+
+      shaders = si_pc_shader_type_bits[shader_id];
+
+      query_shaders = query->shaders & ~SI_PC_SHADERS_WINDOWING;
+      if (query_shaders && query_shaders != shaders) {
+         fprintf(stderr, "si_perfcounter: incompatible shader groups\n");
+         FREE(group);
+         return NULL;
+      }
+      query->shaders = shaders;
+   }
+
+   if (block->b->b->flags & SI_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
+      // A non-zero value in query->shaders ensures that the shader
+      // masking is reset unless the user explicitly requests one.
+      query->shaders = SI_PC_SHADERS_WINDOWING;
+   }
+
+   if (si_pc_block_has_per_se_groups(screen->perfcounters, block)) {
+      group->se = sub_gid / block->num_instances;
+      sub_gid = sub_gid % block->num_instances;
+   } else {
+      group->se = -1;
+   }
+
+   if (si_pc_block_has_per_instance_groups(screen->perfcounters, block)) {
+      group->instance = sub_gid;
+   } else {
+      group->instance = -1;
+   }
+
+   group->next = query->groups;
+   query->groups = group;
+
+   return group;
 }
 
-struct pipe_query *si_create_batch_query(struct pipe_context *ctx,
-                                        unsigned num_queries,
-                                        unsigned *query_types)
+struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_queries,
+                                         unsigned *query_types)
 {
-       struct si_screen *screen =
-               (struct si_screen *)ctx->screen;
-       struct si_perfcounters *pc = screen->perfcounters;
-       struct si_pc_block *block;
-       struct si_query_group *group;
-       struct si_query_pc *query;
-       unsigned base_gid, sub_gid, sub_index;
-       unsigned i, j;
-
-       if (!pc)
-               return NULL;
-
-       query = CALLOC_STRUCT(si_query_pc);
-       if (!query)
-               return NULL;
-
-       query->b.ops = &batch_query_ops;
-
-       query->num_counters = num_queries;
-
-       /* Collect selectors per group */
-       for (i = 0; i < num_queries; ++i) {
-               unsigned sub_gid;
-
-               if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER)
-                       goto error;
-
-               block = lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER,
-                                      &base_gid, &sub_index);
-               if (!block)
-                       goto error;
-
-               sub_gid = sub_index / block->b->selectors;
-               sub_index = sub_index % block->b->selectors;
-
-               group = get_group_state(screen, query, block, sub_gid);
-               if (!group)
-                       goto error;
-
-               if (group->num_counters >= block->b->b->num_counters) {
-                       fprintf(stderr,
-                               "perfcounter group %s: too many selected\n",
-                               block->b->b->name);
-                       goto error;
-               }
-               group->selectors[group->num_counters] = sub_index;
-               ++group->num_counters;
-       }
-
-       /* Compute result bases and CS size per group */
-       query->b.num_cs_dw_suspend = pc->num_stop_cs_dwords;
-       query->b.num_cs_dw_suspend += pc->num_instance_cs_dwords;
-
-       i = 0;
-       for (group = query->groups; group; group = group->next) {
-               struct si_pc_block *block = group->block;
-               unsigned read_dw;
-               unsigned instances = 1;
-
-               if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
-                       instances = screen->info.max_se;
-               if (group->instance < 0)
-                       instances *= block->num_instances;
-
-               group->result_base = i;
-               query->result_size += sizeof(uint64_t) * instances * group->num_counters;
-               i += instances * group->num_counters;
-
-               read_dw = 6 * group->num_counters;
-               query->b.num_cs_dw_suspend += instances * read_dw;
-               query->b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords;
-       }
-
-       if (query->shaders) {
-               if (query->shaders == SI_PC_SHADERS_WINDOWING)
-                       query->shaders = 0xffffffff;
-       }
-
-       /* Map user-supplied query array to result indices */
-       query->counters = CALLOC(num_queries, sizeof(*query->counters));
-       for (i = 0; i < num_queries; ++i) {
-               struct si_query_counter *counter = &query->counters[i];
-               struct si_pc_block *block;
-
-               block = lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER,
-                                      &base_gid, &sub_index);
-
-               sub_gid = sub_index / block->b->selectors;
-               sub_index = sub_index % block->b->selectors;
-
-               group = get_group_state(screen, query, block, sub_gid);
-               assert(group != NULL);
-
-               for (j = 0; j < group->num_counters; ++j) {
-                       if (group->selectors[j] == sub_index)
-                               break;
-               }
-
-               counter->base = group->result_base + j;
-               counter->stride = group->num_counters;
-
-               counter->qwords = 1;
-               if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
-                       counter->qwords = screen->info.max_se;
-               if (group->instance < 0)
-                       counter->qwords *= block->num_instances;
-       }
+   struct si_screen *screen = (struct si_screen *)ctx->screen;
+   struct si_perfcounters *pc = screen->perfcounters;
+   struct si_pc_block *block;
+   struct si_query_group *group;
+   struct si_query_pc *query;
+   unsigned base_gid, sub_gid, sub_index;
+   unsigned i, j;
+
+   if (!pc)
+      return NULL;
+
+   query = CALLOC_STRUCT(si_query_pc);
+   if (!query)
+      return NULL;
+
+   query->b.ops = &batch_query_ops;
+
+   query->num_counters = num_queries;
+
+   /* Collect selectors per group */
+   for (i = 0; i < num_queries; ++i) {
+      unsigned sub_gid;
+
+      if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER)
+         goto error;
+
+      block =
+         lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);
+      if (!block)
+         goto error;
+
+      sub_gid = sub_index / block->b->selectors;
+      sub_index = sub_index % block->b->selectors;
+
+      group = get_group_state(screen, query, block, sub_gid);
+      if (!group)
+         goto error;
+
+      if (group->num_counters >= block->b->b->num_counters) {
+         fprintf(stderr, "perfcounter group %s: too many selected\n", block->b->b->name);
+         goto error;
+      }
+      group->selectors[group->num_counters] = sub_index;
+      ++group->num_counters;
+   }
+
+   /* Compute result bases and CS size per group */
+   query->b.num_cs_dw_suspend = pc->num_stop_cs_dwords;
+   query->b.num_cs_dw_suspend += pc->num_instance_cs_dwords;
+
+   i = 0;
+   for (group = query->groups; group; group = group->next) {
+      struct si_pc_block *block = group->block;
+      unsigned read_dw;
+      unsigned instances = 1;
+
+      if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
+         instances = screen->info.max_se;
+      if (group->instance < 0)
+         instances *= block->num_instances;
+
+      group->result_base = i;
+      query->result_size += sizeof(uint64_t) * instances * group->num_counters;
+      i += instances * group->num_counters;
+
+      read_dw = 6 * group->num_counters;
+      query->b.num_cs_dw_suspend += instances * read_dw;
+      query->b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords;
+   }
+
+   if (query->shaders) {
+      if (query->shaders == SI_PC_SHADERS_WINDOWING)
+         query->shaders = 0xffffffff;
+   }
+
+   /* Map user-supplied query array to result indices */
+   query->counters = CALLOC(num_queries, sizeof(*query->counters));
+   for (i = 0; i < num_queries; ++i) {
+      struct si_query_counter *counter = &query->counters[i];
+      struct si_pc_block *block;
+
+      block =
+         lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);
+
+      sub_gid = sub_index / block->b->selectors;
+      sub_index = sub_index % block->b->selectors;
+
+      group = get_group_state(screen, query, block, sub_gid);
+      assert(group != NULL);
+
+      for (j = 0; j < group->num_counters; ++j) {
+         if (group->selectors[j] == sub_index)
+            break;
+      }
+
+      counter->base = group->result_base + j;
+      counter->stride = group->num_counters;
+
+      counter->qwords = 1;
+      if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
+         counter->qwords = screen->info.max_se;
+      if (group->instance < 0)
+         counter->qwords *= block->num_instances;
+   }
 
-       return (struct pipe_query *)query;
+   return (struct pipe_query *)query;
 
 error:
-       si_pc_query_destroy((struct si_context *)ctx, &query->b);
-       return NULL;
+   si_pc_query_destroy((struct si_context *)ctx, &query->b);
+   return NULL;
 }
 
-static bool si_init_block_names(struct si_screen *screen,
-                               struct si_pc_block *block)
+static bool si_init_block_names(struct si_screen *screen, struct si_pc_block *block)
 {
-       bool per_instance_groups = si_pc_block_has_per_instance_groups(screen->perfcounters, block);
-       bool per_se_groups = si_pc_block_has_per_se_groups(screen->perfcounters, block);
-       unsigned i, j, k;
-       unsigned groups_shader = 1, groups_se = 1, groups_instance = 1;
-       unsigned namelen;
-       char *groupname;
-       char *p;
-
-       if (per_instance_groups)
-               groups_instance = block->num_instances;
-       if (per_se_groups)
-               groups_se = screen->info.max_se;
-       if (block->b->b->flags & SI_PC_BLOCK_SHADER)
-               groups_shader = ARRAY_SIZE(si_pc_shader_type_bits);
-
-       namelen = strlen(block->b->b->name);
-       block->group_name_stride = namelen + 1;
-       if (block->b->b->flags & SI_PC_BLOCK_SHADER)
-               block->group_name_stride += 3;
-       if (per_se_groups) {
-               assert(groups_se <= 10);
-               block->group_name_stride += 1;
-
-               if (per_instance_groups)
-                       block->group_name_stride += 1;
-       }
-       if (per_instance_groups) {
-               assert(groups_instance <= 100);
-               block->group_name_stride += 2;
-       }
-
-       block->group_names = MALLOC(block->num_groups * block->group_name_stride);
-       if (!block->group_names)
-               return false;
-
-       groupname = block->group_names;
-       for (i = 0; i < groups_shader; ++i) {
-               const char *shader_suffix = si_pc_shader_type_suffixes[i];
-               unsigned shaderlen = strlen(shader_suffix);
-               for (j = 0; j < groups_se; ++j) {
-                       for (k = 0; k < groups_instance; ++k) {
-                               strcpy(groupname, block->b->b->name);
-                               p = groupname + namelen;
-
-                               if (block->b->b->flags & SI_PC_BLOCK_SHADER) {
-                                       strcpy(p, shader_suffix);
-                                       p += shaderlen;
-                               }
-
-                               if (per_se_groups) {
-                                       p += sprintf(p, "%d", j);
-                                       if (per_instance_groups)
-                                               *p++ = '_';
-                               }
-
-                               if (per_instance_groups)
-                                       p += sprintf(p, "%d", k);
-
-                               groupname += block->group_name_stride;
-                       }
-               }
-       }
-
-       assert(block->b->selectors <= 1000);
-       block->selector_name_stride = block->group_name_stride + 4;
-       block->selector_names = MALLOC(block->num_groups * block->b->selectors *
-                                      block->selector_name_stride);
-       if (!block->selector_names)
-               return false;
-
-       groupname = block->group_names;
-       p = block->selector_names;
-       for (i = 0; i < block->num_groups; ++i) {
-               for (j = 0; j < block->b->selectors; ++j) {
-                       sprintf(p, "%s_%03d", groupname, j);
-                       p += block->selector_name_stride;
-               }
-               groupname += block->group_name_stride;
-       }
-
-       return true;
+   bool per_instance_groups = si_pc_block_has_per_instance_groups(screen->perfcounters, block);
+   bool per_se_groups = si_pc_block_has_per_se_groups(screen->perfcounters, block);
+   unsigned i, j, k;
+   unsigned groups_shader = 1, groups_se = 1, groups_instance = 1;
+   unsigned namelen;
+   char *groupname;
+   char *p;
+
+   if (per_instance_groups)
+      groups_instance = block->num_instances;
+   if (per_se_groups)
+      groups_se = screen->info.max_se;
+   if (block->b->b->flags & SI_PC_BLOCK_SHADER)
+      groups_shader = ARRAY_SIZE(si_pc_shader_type_bits);
+
+   namelen = strlen(block->b->b->name);
+   block->group_name_stride = namelen + 1;
+   if (block->b->b->flags & SI_PC_BLOCK_SHADER)
+      block->group_name_stride += 3;
+   if (per_se_groups) {
+      assert(groups_se <= 10);
+      block->group_name_stride += 1;
+
+      if (per_instance_groups)
+         block->group_name_stride += 1;
+   }
+   if (per_instance_groups) {
+      assert(groups_instance <= 100);
+      block->group_name_stride += 2;
+   }
+
+   block->group_names = MALLOC(block->num_groups * block->group_name_stride);
+   if (!block->group_names)
+      return false;
+
+   groupname = block->group_names;
+   for (i = 0; i < groups_shader; ++i) {
+      const char *shader_suffix = si_pc_shader_type_suffixes[i];
+      unsigned shaderlen = strlen(shader_suffix);
+      for (j = 0; j < groups_se; ++j) {
+         for (k = 0; k < groups_instance; ++k) {
+            strcpy(groupname, block->b->b->name);
+            p = groupname + namelen;
+
+            if (block->b->b->flags & SI_PC_BLOCK_SHADER) {
+               strcpy(p, shader_suffix);
+               p += shaderlen;
+            }
+
+            if (per_se_groups) {
+               p += sprintf(p, "%d", j);
+               if (per_instance_groups)
+                  *p++ = '_';
+            }
+
+            if (per_instance_groups)
+               p += sprintf(p, "%d", k);
+
+            groupname += block->group_name_stride;
+         }
+      }
+   }
+
+   assert(block->b->selectors <= 1000);
+   block->selector_name_stride = block->group_name_stride + 4;
+   block->selector_names =
+      MALLOC(block->num_groups * block->b->selectors * block->selector_name_stride);
+   if (!block->selector_names)
+      return false;
+
+   groupname = block->group_names;
+   p = block->selector_names;
+   for (i = 0; i < block->num_groups; ++i) {
+      for (j = 0; j < block->b->selectors; ++j) {
+         sprintf(p, "%s_%03d", groupname, j);
+         p += block->selector_name_stride;
+      }
+      groupname += block->group_name_stride;
+   }
+
+   return true;
 }
 
-int si_get_perfcounter_info(struct si_screen *screen,
-                           unsigned index,
-                           struct pipe_driver_query_info *info)
+int si_get_perfcounter_info(struct si_screen *screen, unsigned index,
+                            struct pipe_driver_query_info *info)
 {
-       struct si_perfcounters *pc = screen->perfcounters;
-       struct si_pc_block *block;
-       unsigned base_gid, sub;
-
-       if (!pc)
-               return 0;
-
-       if (!info) {
-               unsigned bid, num_queries = 0;
-
-               for (bid = 0; bid < pc->num_blocks; ++bid) {
-                       num_queries += pc->blocks[bid].b->selectors *
-                                      pc->blocks[bid].num_groups;
-               }
-
-               return num_queries;
-       }
-
-       block = lookup_counter(pc, index, &base_gid, &sub);
-       if (!block)
-               return 0;
-
-       if (!block->selector_names) {
-               if (!si_init_block_names(screen, block))
-                       return 0;
-       }
-       info->name = block->selector_names + sub * block->selector_name_stride;
-       info->query_type = SI_QUERY_FIRST_PERFCOUNTER + index;
-       info->max_value.u64 = 0;
-       info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
-       info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
-       info->group_id = base_gid + sub / block->b->selectors;
-       info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
-       if (sub > 0 && sub + 1 < block->b->selectors * block->num_groups)
-               info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST;
-       return 1;
+   struct si_perfcounters *pc = screen->perfcounters;
+   struct si_pc_block *block;
+   unsigned base_gid, sub;
+
+   if (!pc)
+      return 0;
+
+   if (!info) {
+      unsigned bid, num_queries = 0;
+
+      for (bid = 0; bid < pc->num_blocks; ++bid) {
+         num_queries += pc->blocks[bid].b->selectors * pc->blocks[bid].num_groups;
+      }
+
+      return num_queries;
+   }
+
+   block = lookup_counter(pc, index, &base_gid, &sub);
+   if (!block)
+      return 0;
+
+   if (!block->selector_names) {
+      if (!si_init_block_names(screen, block))
+         return 0;
+   }
+   info->name = block->selector_names + sub * block->selector_name_stride;
+   info->query_type = SI_QUERY_FIRST_PERFCOUNTER + index;
+   info->max_value.u64 = 0;
+   info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
+   info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
+   info->group_id = base_gid + sub / block->b->selectors;
+   info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
+   if (sub > 0 && sub + 1 < block->b->selectors * block->num_groups)
+      info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST;
+   return 1;
 }
 
-int si_get_perfcounter_group_info(struct si_screen *screen,
-                                 unsigned index,
-                                 struct pipe_driver_query_group_info *info)
+int si_get_perfcounter_group_info(struct si_screen *screen, unsigned index,
+                                  struct pipe_driver_query_group_info *info)
 {
-       struct si_perfcounters *pc = screen->perfcounters;
-       struct si_pc_block *block;
-
-       if (!pc)
-               return 0;
-
-       if (!info)
-               return pc->num_groups;
-
-       block = lookup_group(pc, &index);
-       if (!block)
-               return 0;
-
-       if (!block->group_names) {
-               if (!si_init_block_names(screen, block))
-                       return 0;
-       }
-       info->name = block->group_names + index * block->group_name_stride;
-       info->num_queries = block->b->selectors;
-       info->max_active_queries = block->b->b->num_counters;
-       return 1;
+   struct si_perfcounters *pc = screen->perfcounters;
+   struct si_pc_block *block;
+
+   if (!pc)
+      return 0;
+
+   if (!info)
+      return pc->num_groups;
+
+   block = lookup_group(pc, &index);
+   if (!block)
+      return 0;
+
+   if (!block->group_names) {
+      if (!si_init_block_names(screen, block))
+         return 0;
+   }
+   info->name = block->group_names + index * block->group_name_stride;
+   info->num_queries = block->b->selectors;
+   info->max_active_queries = block->b->b->num_counters;
+   return 1;
 }
 
 void si_destroy_perfcounters(struct si_screen *screen)
 {
-       struct si_perfcounters *pc = screen->perfcounters;
-       unsigned i;
-
-       if (!pc)
-               return;
-
-       for (i = 0; i < pc->num_blocks; ++i) {
-               FREE(pc->blocks[i].group_names);
-               FREE(pc->blocks[i].selector_names);
-       }
-       FREE(pc->blocks);
-       FREE(pc);
-       screen->perfcounters = NULL;
+   struct si_perfcounters *pc = screen->perfcounters;
+   unsigned i;
+
+   if (!pc)
+      return;
+
+   for (i = 0; i < pc->num_blocks; ++i) {
+      FREE(pc->blocks[i].group_names);
+      FREE(pc->blocks[i].selector_names);
+   }
+   FREE(pc->blocks);
+   FREE(pc);
+   screen->perfcounters = NULL;
 }
 
 void si_init_perfcounters(struct si_screen *screen)
 {
-       struct si_perfcounters *pc;
-       const struct si_pc_block_gfxdescr *blocks;
-       unsigned num_blocks;
-       unsigned i;
-
-       switch (screen->info.chip_class) {
-       case GFX7:
-               blocks = groups_CIK;
-               num_blocks = ARRAY_SIZE(groups_CIK);
-               break;
-       case GFX8:
-               blocks = groups_VI;
-               num_blocks = ARRAY_SIZE(groups_VI);
-               break;
-       case GFX9:
-               blocks = groups_gfx9;
-               num_blocks = ARRAY_SIZE(groups_gfx9);
-               break;
-       case GFX6:
-       default:
-               return; /* not implemented */
-       }
-
-       if (screen->info.max_sh_per_se != 1) {
-               /* This should not happen on non-GFX6 chips. */
-               fprintf(stderr, "si_init_perfcounters: max_sh_per_se = %d not "
-                       "supported (inaccurate performance counters)\n",
-                       screen->info.max_sh_per_se);
-       }
-
-       screen->perfcounters = pc = CALLOC_STRUCT(si_perfcounters);
-       if (!pc)
-               return;
-
-       pc->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen);
-       pc->num_instance_cs_dwords = 3;
-
-       pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
-       pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);
-
-       pc->blocks = CALLOC(num_blocks, sizeof(struct si_pc_block));
-       if (!pc->blocks)
-               goto error;
-       pc->num_blocks = num_blocks;
-
-       for (i = 0; i < num_blocks; ++i) {
-               struct si_pc_block *block = &pc->blocks[i];
-               block->b = &blocks[i];
-               block->num_instances = MAX2(1, block->b->instances);
-
-               if (!strcmp(block->b->b->name, "CB") ||
-                   !strcmp(block->b->b->name, "DB"))
-                       block->num_instances = screen->info.max_se;
-               else if (!strcmp(block->b->b->name, "TCC"))
-                       block->num_instances = screen->info.num_tcc_blocks;
-               else if (!strcmp(block->b->b->name, "IA"))
-                       block->num_instances = MAX2(1, screen->info.max_se / 2);
-
-               if (si_pc_block_has_per_instance_groups(pc, block)) {
-                       block->num_groups = block->num_instances;
-               } else {
-                       block->num_groups = 1;
-               }
-
-               if (si_pc_block_has_per_se_groups(pc, block))
-                       block->num_groups *= screen->info.max_se;
-               if (block->b->b->flags & SI_PC_BLOCK_SHADER)
-                       block->num_groups *= ARRAY_SIZE(si_pc_shader_type_bits);
-
-               pc->num_groups += block->num_groups;
-       }
-
-       return;
+   struct si_perfcounters *pc;
+   const struct si_pc_block_gfxdescr *blocks;
+   unsigned num_blocks;
+   unsigned i;
+
+   switch (screen->info.chip_class) {
+   case GFX7:
+      blocks = groups_CIK;
+      num_blocks = ARRAY_SIZE(groups_CIK);
+      break;
+   case GFX8:
+      blocks = groups_VI;
+      num_blocks = ARRAY_SIZE(groups_VI);
+      break;
+   case GFX9:
+      blocks = groups_gfx9;
+      num_blocks = ARRAY_SIZE(groups_gfx9);
+      break;
+   case GFX6:
+   default:
+      return; /* not implemented */
+   }
+
+   if (screen->info.max_sh_per_se != 1) {
+      /* This should not happen on non-GFX6 chips. */
+      fprintf(stderr,
+              "si_init_perfcounters: max_sh_per_se = %d not "
+              "supported (inaccurate performance counters)\n",
+              screen->info.max_sh_per_se);
+   }
+
+   screen->perfcounters = pc = CALLOC_STRUCT(si_perfcounters);
+   if (!pc)
+      return;
+
+   pc->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen);
+   pc->num_instance_cs_dwords = 3;
+
+   pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
+   pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);
+
+   pc->blocks = CALLOC(num_blocks, sizeof(struct si_pc_block));
+   if (!pc->blocks)
+      goto error;
+   pc->num_blocks = num_blocks;
+
+   for (i = 0; i < num_blocks; ++i) {
+      struct si_pc_block *block = &pc->blocks[i];
+      block->b = &blocks[i];
+      block->num_instances = MAX2(1, block->b->instances);
+
+      if (!strcmp(block->b->b->name, "CB") || !strcmp(block->b->b->name, "DB"))
+         block->num_instances = screen->info.max_se;
+      else if (!strcmp(block->b->b->name, "TCC"))
+         block->num_instances = screen->info.num_tcc_blocks;
+      else if (!strcmp(block->b->b->name, "IA"))
+         block->num_instances = MAX2(1, screen->info.max_se / 2);
+
+      if (si_pc_block_has_per_instance_groups(pc, block)) {
+         block->num_groups = block->num_instances;
+      } else {
+         block->num_groups = 1;
+      }
+
+      if (si_pc_block_has_per_se_groups(pc, block))
+         block->num_groups *= screen->info.max_se;
+      if (block->b->b->flags & SI_PC_BLOCK_SHADER)
+         block->num_groups *= ARRAY_SIZE(si_pc_shader_type_bits);
+
+      pc->num_groups += block->num_groups;
+   }
+
+   return;
 
 error:
-       si_destroy_perfcounters(screen);
+   si_destroy_perfcounters(screen);
 }
index d900467964b3452a4b94a3eac3bc7e27e2ef8248..816015d1f822d2c2673f0155ed4bce81f84e6008 100644 (file)
  */
 
 #include "si_pipe.h"
+
+#include "driver_ddebug/dd_util.h"
+#include "gallium/winsys/amdgpu/drm/amdgpu_public.h"
+#include "gallium/winsys/radeon/drm/radeon_drm_public.h"
+#include "radeon/radeon_uvd.h"
+#include "si_compute.h"
 #include "si_public.h"
 #include "si_shader_internal.h"
-#include "si_compute.h"
 #include "sid.h"
-
-#include "radeon/radeon_uvd.h"
 #include "util/disk_cache.h"
 #include "util/u_log.h"
 #include "util/u_memory.h"
 #include "util/u_upload_mgr.h"
 #include "util/xmlconfig.h"
 #include "vl/vl_decoder.h"
-#include "driver_ddebug/dd_util.h"
 
-#include "gallium/winsys/radeon/drm/radeon_drm_public.h"
-#include "gallium/winsys/amdgpu/drm/amdgpu_public.h"
 #include <xf86drm.h>
 
-static struct pipe_context *si_create_context(struct pipe_screen *screen,
-                                              unsigned flags);
+static struct pipe_context *si_create_context(struct pipe_screen *screen, unsigned flags);
 
 static const struct debug_named_value debug_options[] = {
-       /* Shader logging options: */
-       { "vs", DBG(VS), "Print vertex shaders" },
-       { "ps", DBG(PS), "Print pixel shaders" },
-       { "gs", DBG(GS), "Print geometry shaders" },
-       { "tcs", DBG(TCS), "Print tessellation control shaders" },
-       { "tes", DBG(TES), "Print tessellation evaluation shaders" },
-       { "cs", DBG(CS), "Print compute shaders" },
-       { "noir", DBG(NO_IR), "Don't print the LLVM IR"},
-       { "nonir", DBG(NO_NIR), "Don't print NIR when printing shaders"},
-       { "noasm", DBG(NO_ASM), "Don't print disassembled shaders"},
-       { "preoptir", DBG(PREOPT_IR), "Print the LLVM IR before initial optimizations" },
-
-       /* Shader compiler options the shader cache should be aware of: */
-       { "gisel", DBG(GISEL), "Enable LLVM global instruction selector." },
-       { "w32ge", DBG(W32_GE), "Use Wave32 for vertex, tessellation, and geometry shaders." },
-       { "w32ps", DBG(W32_PS), "Use Wave32 for pixel shaders." },
-       { "w32cs", DBG(W32_CS), "Use Wave32 for computes shaders." },
-       { "w64ge", DBG(W64_GE), "Use Wave64 for vertex, tessellation, and geometry shaders." },
-       { "w64ps", DBG(W64_PS), "Use Wave64 for pixel shaders." },
-       { "w64cs", DBG(W64_CS), "Use Wave64 for computes shaders." },
-
-       /* Shader compiler options (with no effect on the shader cache): */
-       { "checkir", DBG(CHECK_IR), "Enable additional sanity checks on shader IR" },
-       { "mono", DBG(MONOLITHIC_SHADERS), "Use old-style monolithic shaders compiled on demand" },
-       { "nooptvariant", DBG(NO_OPT_VARIANT), "Disable compiling optimized shader variants." },
-
-       /* Information logging options: */
-       { "info", DBG(INFO), "Print driver information" },
-       { "tex", DBG(TEX), "Print texture info" },
-       { "compute", DBG(COMPUTE), "Print compute info" },
-       { "vm", DBG(VM), "Print virtual addresses when creating resources" },
-       { "cache_stats", DBG(CACHE_STATS), "Print shader cache statistics." },
-
-       /* Driver options: */
-       { "forcedma", DBG(FORCE_SDMA), "Use SDMA for all operations when possible." },
-       { "nodma", DBG(NO_SDMA), "Disable SDMA" },
-       { "nodmaclear", DBG(NO_SDMA_CLEARS), "Disable SDMA clears" },
-       { "nodmacopyimage", DBG(NO_SDMA_COPY_IMAGE), "Disable SDMA image copies" },
-       { "nowc", DBG(NO_WC), "Disable GTT write combining" },
-       { "check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info." },
-       { "reserve_vmid", DBG(RESERVE_VMID), "Force VMID reservation per context." },
-       { "zerovram", DBG(ZERO_VRAM), "Clear VRAM allocations." },
-
-       /* 3D engine options: */
-       { "nogfx", DBG(NO_GFX), "Disable graphics. Only multimedia compute paths can be used." },
-       { "nongg", DBG(NO_NGG), "Disable NGG and use the legacy pipeline." },
-       { "nggc", DBG(ALWAYS_NGG_CULLING), "Always use NGG culling even when it can hurt." },
-       { "nonggc", DBG(NO_NGG_CULLING), "Disable NGG culling." },
-       { "alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader." },
-       { "pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls." },
-       { "nopd", DBG(NO_PD), "Disable the primitive discard compute shader." },
-       { "switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet." },
-       { "nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization" },
-       { "nodpbb", DBG(NO_DPBB), "Disable DPBB." },
-       { "nodfsm", DBG(NO_DFSM), "Disable DFSM." },
-       { "dpbb", DBG(DPBB), "Enable DPBB." },
-       { "dfsm", DBG(DFSM), "Enable DFSM." },
-       { "nohyperz", DBG(NO_HYPERZ), "Disable Hyper-Z" },
-       { "norbplus", DBG(NO_RB_PLUS), "Disable RB+." },
-       { "no2d", DBG(NO_2D_TILING), "Disable 2D tiling" },
-       { "notiling", DBG(NO_TILING), "Disable tiling" },
-       { "nodcc", DBG(NO_DCC), "Disable DCC." },
-       { "nodccclear", DBG(NO_DCC_CLEAR), "Disable DCC fast clear." },
-       { "nodccfb", DBG(NO_DCC_FB), "Disable separate DCC on the main framebuffer" },
-       { "nodccmsaa", DBG(NO_DCC_MSAA), "Disable DCC for MSAA" },
-       { "nofmask", DBG(NO_FMASK), "Disable MSAA compression" },
-
-       DEBUG_NAMED_VALUE_END /* must be last */
+   /* Shader logging options: */
+   {"vs", DBG(VS), "Print vertex shaders"},
+   {"ps", DBG(PS), "Print pixel shaders"},
+   {"gs", DBG(GS), "Print geometry shaders"},
+   {"tcs", DBG(TCS), "Print tessellation control shaders"},
+   {"tes", DBG(TES), "Print tessellation evaluation shaders"},
+   {"cs", DBG(CS), "Print compute shaders"},
+   {"noir", DBG(NO_IR), "Don't print the LLVM IR"},
+   {"nonir", DBG(NO_NIR), "Don't print NIR when printing shaders"},
+   {"noasm", DBG(NO_ASM), "Don't print disassembled shaders"},
+   {"preoptir", DBG(PREOPT_IR), "Print the LLVM IR before initial optimizations"},
+
+   /* Shader compiler options the shader cache should be aware of: */
+   {"gisel", DBG(GISEL), "Enable LLVM global instruction selector."},
+   {"w32ge", DBG(W32_GE), "Use Wave32 for vertex, tessellation, and geometry shaders."},
+   {"w32ps", DBG(W32_PS), "Use Wave32 for pixel shaders."},
+   {"w32cs", DBG(W32_CS), "Use Wave32 for computes shaders."},
+   {"w64ge", DBG(W64_GE), "Use Wave64 for vertex, tessellation, and geometry shaders."},
+   {"w64ps", DBG(W64_PS), "Use Wave64 for pixel shaders."},
+   {"w64cs", DBG(W64_CS), "Use Wave64 for computes shaders."},
+
+   /* Shader compiler options (with no effect on the shader cache): */
+   {"checkir", DBG(CHECK_IR), "Enable additional sanity checks on shader IR"},
+   {"mono", DBG(MONOLITHIC_SHADERS), "Use old-style monolithic shaders compiled on demand"},
+   {"nooptvariant", DBG(NO_OPT_VARIANT), "Disable compiling optimized shader variants."},
+
+   /* Information logging options: */
+   {"info", DBG(INFO), "Print driver information"},
+   {"tex", DBG(TEX), "Print texture info"},
+   {"compute", DBG(COMPUTE), "Print compute info"},
+   {"vm", DBG(VM), "Print virtual addresses when creating resources"},
+   {"cache_stats", DBG(CACHE_STATS), "Print shader cache statistics."},
+
+   /* Driver options: */
+   {"forcedma", DBG(FORCE_SDMA), "Use SDMA for all operations when possible."},
+   {"nodma", DBG(NO_SDMA), "Disable SDMA"},
+   {"nodmaclear", DBG(NO_SDMA_CLEARS), "Disable SDMA clears"},
+   {"nodmacopyimage", DBG(NO_SDMA_COPY_IMAGE), "Disable SDMA image copies"},
+   {"nowc", DBG(NO_WC), "Disable GTT write combining"},
+   {"check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info."},
+   {"reserve_vmid", DBG(RESERVE_VMID), "Force VMID reservation per context."},
+   {"zerovram", DBG(ZERO_VRAM), "Clear VRAM allocations."},
+
+   /* 3D engine options: */
+   {"nogfx", DBG(NO_GFX), "Disable graphics. Only multimedia compute paths can be used."},
+   {"nongg", DBG(NO_NGG), "Disable NGG and use the legacy pipeline."},
+   {"nggc", DBG(ALWAYS_NGG_CULLING), "Always use NGG culling even when it can hurt."},
+   {"nonggc", DBG(NO_NGG_CULLING), "Disable NGG culling."},
+   {"alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader."},
+   {"pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls."},
+   {"nopd", DBG(NO_PD), "Disable the primitive discard compute shader."},
+   {"switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet."},
+   {"nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization"},
+   {"nodpbb", DBG(NO_DPBB), "Disable DPBB."},
+   {"nodfsm", DBG(NO_DFSM), "Disable DFSM."},
+   {"dpbb", DBG(DPBB), "Enable DPBB."},
+   {"dfsm", DBG(DFSM), "Enable DFSM."},
+   {"nohyperz", DBG(NO_HYPERZ), "Disable Hyper-Z"},
+   {"norbplus", DBG(NO_RB_PLUS), "Disable RB+."},
+   {"no2d", DBG(NO_2D_TILING), "Disable 2D tiling"},
+   {"notiling", DBG(NO_TILING), "Disable tiling"},
+   {"nodcc", DBG(NO_DCC), "Disable DCC."},
+   {"nodccclear", DBG(NO_DCC_CLEAR), "Disable DCC fast clear."},
+   {"nodccfb", DBG(NO_DCC_FB), "Disable separate DCC on the main framebuffer"},
+   {"nodccmsaa", DBG(NO_DCC_MSAA), "Disable DCC for MSAA"},
+   {"nofmask", DBG(NO_FMASK), "Disable MSAA compression"},
+
+   DEBUG_NAMED_VALUE_END /* must be last */
 };
 
 static const struct debug_named_value test_options[] = {
-       /* Tests: */
-       { "testdma", DBG(TEST_DMA), "Invoke SDMA tests and exit." },
-       { "testvmfaultcp", DBG(TEST_VMFAULT_CP), "Invoke a CP VM fault test and exit." },
-       { "testvmfaultsdma", DBG(TEST_VMFAULT_SDMA), "Invoke a SDMA VM fault test and exit." },
-       { "testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM fault test and exit." },
-       { "testdmaperf", DBG(TEST_DMA_PERF), "Test DMA performance" },
-       { "testgds", DBG(TEST_GDS), "Test GDS." },
-       { "testgdsmm", DBG(TEST_GDS_MM), "Test GDS memory management." },
-       { "testgdsoamm", DBG(TEST_GDS_OA_MM), "Test GDS OA memory management." },
-
-       DEBUG_NAMED_VALUE_END /* must be last */
+   /* Tests: */
+   {"testdma", DBG(TEST_DMA), "Invoke SDMA tests and exit."},
+   {"testvmfaultcp", DBG(TEST_VMFAULT_CP), "Invoke a CP VM fault test and exit."},
+   {"testvmfaultsdma", DBG(TEST_VMFAULT_SDMA), "Invoke a SDMA VM fault test and exit."},
+   {"testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM fault test and exit."},
+   {"testdmaperf", DBG(TEST_DMA_PERF), "Test DMA performance"},
+   {"testgds", DBG(TEST_GDS), "Test GDS."},
+   {"testgdsmm", DBG(TEST_GDS_MM), "Test GDS memory management."},
+   {"testgdsoamm", DBG(TEST_GDS_OA_MM), "Test GDS OA memory management."},
+
+   DEBUG_NAMED_VALUE_END /* must be last */
 };
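These option tables use the standard Gallium debug_named_value format and are normally parsed with the u_debug helpers (the same family as the debug_get_bool_option() calls in si_init_perfcounters above). The sketch below shows the typical lookup; the environment-variable name "AMD_DEBUG" and the wrapper function are assumptions for illustration, not something taken from this diff.

/* Hypothetical sketch (not part of the commit): how a debug_named_value
 * table such as debug_options is typically consumed. Returns the OR of all
 * flag values whose names appear in the comma-separated environment
 * variable, or 0 when it is unset. */
static uint64_t example_parse_debug_flags(void)
{
   return debug_get_flags_option("AMD_DEBUG", debug_options, 0);
}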
 
 void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compiler)
 {
-       /* Only create the less-optimizing version of the compiler on APUs
-        * predating Ryzen (Raven). */
-       bool create_low_opt_compiler = !sscreen->info.has_dedicated_vram &&
-                                      sscreen->info.chip_class <= GFX8;
-
-       enum ac_target_machine_options tm_options =
-               (sscreen->debug_flags & DBG(GISEL) ? AC_TM_ENABLE_GLOBAL_ISEL : 0) |
-               (sscreen->info.chip_class >= GFX9 ? AC_TM_FORCE_ENABLE_XNACK : 0) |
-               (sscreen->info.chip_class < GFX9 ? AC_TM_FORCE_DISABLE_XNACK : 0) |
-               (!sscreen->llvm_has_working_vgpr_indexing ? AC_TM_PROMOTE_ALLOCA_TO_SCRATCH : 0) |
-               (sscreen->debug_flags & DBG(CHECK_IR) ? AC_TM_CHECK_IR : 0) |
-               (create_low_opt_compiler ? AC_TM_CREATE_LOW_OPT : 0);
-
-       ac_init_llvm_once();
-       ac_init_llvm_compiler(compiler, sscreen->info.family, tm_options);
-       compiler->passes = ac_create_llvm_passes(compiler->tm);
-
-       if (compiler->tm_wave32)
-               compiler->passes_wave32 = ac_create_llvm_passes(compiler->tm_wave32);
-       if (compiler->low_opt_tm)
-               compiler->low_opt_passes = ac_create_llvm_passes(compiler->low_opt_tm);
+   /* Only create the less-optimizing version of the compiler on APUs
+    * predating Ryzen (Raven). */
+   bool create_low_opt_compiler =
+      !sscreen->info.has_dedicated_vram && sscreen->info.chip_class <= GFX8;
+
+   enum ac_target_machine_options tm_options =
+      (sscreen->debug_flags & DBG(GISEL) ? AC_TM_ENABLE_GLOBAL_ISEL : 0) |
+      (sscreen->info.chip_class >= GFX9 ? AC_TM_FORCE_ENABLE_XNACK : 0) |
+      (sscreen->info.chip_class < GFX9 ? AC_TM_FORCE_DISABLE_XNACK : 0) |
+      (!sscreen->llvm_has_working_vgpr_indexing ? AC_TM_PROMOTE_ALLOCA_TO_SCRATCH : 0) |
+      (sscreen->debug_flags & DBG(CHECK_IR) ? AC_TM_CHECK_IR : 0) |
+      (create_low_opt_compiler ? AC_TM_CREATE_LOW_OPT : 0);
+
+   ac_init_llvm_once();
+   ac_init_llvm_compiler(compiler, sscreen->info.family, tm_options);
+   compiler->passes = ac_create_llvm_passes(compiler->tm);
+
+   if (compiler->tm_wave32)
+      compiler->passes_wave32 = ac_create_llvm_passes(compiler->tm_wave32);
+   if (compiler->low_opt_tm)
+      compiler->low_opt_passes = ac_create_llvm_passes(compiler->low_opt_tm);
 }
 
 static void si_destroy_compiler(struct ac_llvm_compiler *compiler)
 {
-       ac_destroy_llvm_compiler(compiler);
+   ac_destroy_llvm_compiler(compiler);
 }
 
 /*
@@ -167,195 +166,191 @@ static void si_destroy_compiler(struct ac_llvm_compiler *compiler)
  */
 static void si_destroy_context(struct pipe_context *context)
 {
-       struct si_context *sctx = (struct si_context *)context;
-       int i;
-
-       /* Unreference the framebuffer normally to disable related logic
-        * properly.
-        */
-       struct pipe_framebuffer_state fb = {};
-       if (context->set_framebuffer_state)
-               context->set_framebuffer_state(context, &fb);
-
-       si_release_all_descriptors(sctx);
-
-       if (sctx->chip_class >= GFX10 && sctx->has_graphics)
-               gfx10_destroy_query(sctx);
-
-       pipe_resource_reference(&sctx->esgs_ring, NULL);
-       pipe_resource_reference(&sctx->gsvs_ring, NULL);
-       pipe_resource_reference(&sctx->tess_rings, NULL);
-       pipe_resource_reference(&sctx->null_const_buf.buffer, NULL);
-       pipe_resource_reference(&sctx->sample_pos_buffer, NULL);
-       si_resource_reference(&sctx->border_color_buffer, NULL);
-       free(sctx->border_color_table);
-       si_resource_reference(&sctx->scratch_buffer, NULL);
-       si_resource_reference(&sctx->compute_scratch_buffer, NULL);
-       si_resource_reference(&sctx->wait_mem_scratch, NULL);
-       si_resource_reference(&sctx->small_prim_cull_info_buf, NULL);
-
-       si_pm4_free_state(sctx, sctx->init_config, ~0);
-       if (sctx->init_config_gs_rings)
-               si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
-       for (i = 0; i < ARRAY_SIZE(sctx->vgt_shader_config); i++)
-               si_pm4_delete_state(sctx, vgt_shader_config, sctx->vgt_shader_config[i]);
-
-       if (sctx->fixed_func_tcs_shader.cso)
-               sctx->b.delete_tcs_state(&sctx->b, sctx->fixed_func_tcs_shader.cso);
-       if (sctx->custom_dsa_flush)
-               sctx->b.delete_depth_stencil_alpha_state(&sctx->b, sctx->custom_dsa_flush);
-       if (sctx->custom_blend_resolve)
-               sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_resolve);
-       if (sctx->custom_blend_fmask_decompress)
-               sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_fmask_decompress);
-       if (sctx->custom_blend_eliminate_fastclear)
-               sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_eliminate_fastclear);
-       if (sctx->custom_blend_dcc_decompress)
-               sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_dcc_decompress);
-       if (sctx->vs_blit_pos)
-               sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos);
-       if (sctx->vs_blit_pos_layered)
-               sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos_layered);
-       if (sctx->vs_blit_color)
-               sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color);
-       if (sctx->vs_blit_color_layered)
-               sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color_layered);
-       if (sctx->vs_blit_texcoord)
-               sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_texcoord);
-       if (sctx->cs_clear_buffer)
-               sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer);
-       if (sctx->cs_copy_buffer)
-               sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
-       if (sctx->cs_copy_image)
-               sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image);
-       if (sctx->cs_copy_image_1d_array)
-               sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image_1d_array);
-       if (sctx->cs_clear_render_target)
-               sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target);
-       if (sctx->cs_clear_render_target_1d_array)
-               sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target_1d_array);
-       if (sctx->cs_clear_12bytes_buffer)
-               sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_12bytes_buffer);
-       if (sctx->cs_dcc_retile)
-               sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile);
-
-       for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_fmask_expand); i++) {
-               for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_fmask_expand[i]); j++) {
-                       if (sctx->cs_fmask_expand[i][j]) {
-                               sctx->b.delete_compute_state(&sctx->b,
-                                                            sctx->cs_fmask_expand[i][j]);
-                       }
-               }
-       }
-
-       if (sctx->blitter)
-               util_blitter_destroy(sctx->blitter);
-
-       /* Release DCC stats. */
-       for (int i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) {
-               assert(!sctx->dcc_stats[i].query_active);
-
-               for (int j = 0; j < ARRAY_SIZE(sctx->dcc_stats[i].ps_stats); j++)
-                       if (sctx->dcc_stats[i].ps_stats[j])
-                               sctx->b.destroy_query(&sctx->b,
-                                                       sctx->dcc_stats[i].ps_stats[j]);
-
-               si_texture_reference(&sctx->dcc_stats[i].tex, NULL);
-       }
-
-       if (sctx->query_result_shader)
-               sctx->b.delete_compute_state(&sctx->b, sctx->query_result_shader);
-       if (sctx->sh_query_result_shader)
-               sctx->b.delete_compute_state(&sctx->b, sctx->sh_query_result_shader);
-
-       if (sctx->gfx_cs)
-               sctx->ws->cs_destroy(sctx->gfx_cs);
-       if (sctx->sdma_cs)
-               sctx->ws->cs_destroy(sctx->sdma_cs);
-       if (sctx->ctx)
-               sctx->ws->ctx_destroy(sctx->ctx);
-
-       if (sctx->b.stream_uploader)
-               u_upload_destroy(sctx->b.stream_uploader);
-       if (sctx->b.const_uploader)
-               u_upload_destroy(sctx->b.const_uploader);
-       if (sctx->cached_gtt_allocator)
-               u_upload_destroy(sctx->cached_gtt_allocator);
-
-       slab_destroy_child(&sctx->pool_transfers);
-       slab_destroy_child(&sctx->pool_transfers_unsync);
-
-       if (sctx->allocator_zeroed_memory)
-               u_suballocator_destroy(sctx->allocator_zeroed_memory);
-
-       sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL);
-       sctx->ws->fence_reference(&sctx->last_sdma_fence, NULL);
-       sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL);
-       si_resource_reference(&sctx->eop_bug_scratch, NULL);
-       si_resource_reference(&sctx->index_ring, NULL);
-       si_resource_reference(&sctx->barrier_buf, NULL);
-       si_resource_reference(&sctx->last_ib_barrier_buf, NULL);
-       pb_reference(&sctx->gds, NULL);
-       pb_reference(&sctx->gds_oa, NULL);
-
-       si_destroy_compiler(&sctx->compiler);
-
-       si_saved_cs_reference(&sctx->current_saved_cs, NULL);
-
-       _mesa_hash_table_destroy(sctx->tex_handles, NULL);
-       _mesa_hash_table_destroy(sctx->img_handles, NULL);
-
-       util_dynarray_fini(&sctx->resident_tex_handles);
-       util_dynarray_fini(&sctx->resident_img_handles);
-       util_dynarray_fini(&sctx->resident_tex_needs_color_decompress);
-       util_dynarray_fini(&sctx->resident_img_needs_color_decompress);
-       util_dynarray_fini(&sctx->resident_tex_needs_depth_decompress);
-       si_unref_sdma_uploads(sctx);
-       free(sctx->sdma_uploads);
-       FREE(sctx);
+   struct si_context *sctx = (struct si_context *)context;
+   int i;
+
+   /* Unreference the framebuffer normally to disable related logic
+    * properly.
+    */
+   struct pipe_framebuffer_state fb = {};
+   if (context->set_framebuffer_state)
+      context->set_framebuffer_state(context, &fb);
+
+   si_release_all_descriptors(sctx);
+
+   if (sctx->chip_class >= GFX10 && sctx->has_graphics)
+      gfx10_destroy_query(sctx);
+
+   pipe_resource_reference(&sctx->esgs_ring, NULL);
+   pipe_resource_reference(&sctx->gsvs_ring, NULL);
+   pipe_resource_reference(&sctx->tess_rings, NULL);
+   pipe_resource_reference(&sctx->null_const_buf.buffer, NULL);
+   pipe_resource_reference(&sctx->sample_pos_buffer, NULL);
+   si_resource_reference(&sctx->border_color_buffer, NULL);
+   free(sctx->border_color_table);
+   si_resource_reference(&sctx->scratch_buffer, NULL);
+   si_resource_reference(&sctx->compute_scratch_buffer, NULL);
+   si_resource_reference(&sctx->wait_mem_scratch, NULL);
+   si_resource_reference(&sctx->small_prim_cull_info_buf, NULL);
+
+   si_pm4_free_state(sctx, sctx->init_config, ~0);
+   if (sctx->init_config_gs_rings)
+      si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
+   for (i = 0; i < ARRAY_SIZE(sctx->vgt_shader_config); i++)
+      si_pm4_delete_state(sctx, vgt_shader_config, sctx->vgt_shader_config[i]);
+
+   if (sctx->fixed_func_tcs_shader.cso)
+      sctx->b.delete_tcs_state(&sctx->b, sctx->fixed_func_tcs_shader.cso);
+   if (sctx->custom_dsa_flush)
+      sctx->b.delete_depth_stencil_alpha_state(&sctx->b, sctx->custom_dsa_flush);
+   if (sctx->custom_blend_resolve)
+      sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_resolve);
+   if (sctx->custom_blend_fmask_decompress)
+      sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_fmask_decompress);
+   if (sctx->custom_blend_eliminate_fastclear)
+      sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_eliminate_fastclear);
+   if (sctx->custom_blend_dcc_decompress)
+      sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_dcc_decompress);
+   if (sctx->vs_blit_pos)
+      sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos);
+   if (sctx->vs_blit_pos_layered)
+      sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos_layered);
+   if (sctx->vs_blit_color)
+      sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color);
+   if (sctx->vs_blit_color_layered)
+      sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color_layered);
+   if (sctx->vs_blit_texcoord)
+      sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_texcoord);
+   if (sctx->cs_clear_buffer)
+      sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer);
+   if (sctx->cs_copy_buffer)
+      sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
+   if (sctx->cs_copy_image)
+      sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image);
+   if (sctx->cs_copy_image_1d_array)
+      sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image_1d_array);
+   if (sctx->cs_clear_render_target)
+      sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target);
+   if (sctx->cs_clear_render_target_1d_array)
+      sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target_1d_array);
+   if (sctx->cs_clear_12bytes_buffer)
+      sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_12bytes_buffer);
+   if (sctx->cs_dcc_retile)
+      sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile);
+
+   for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_fmask_expand); i++) {
+      for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_fmask_expand[i]); j++) {
+         if (sctx->cs_fmask_expand[i][j]) {
+            sctx->b.delete_compute_state(&sctx->b, sctx->cs_fmask_expand[i][j]);
+         }
+      }
+   }
+
+   if (sctx->blitter)
+      util_blitter_destroy(sctx->blitter);
+
+   /* Release DCC stats. */
+   for (int i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) {
+      assert(!sctx->dcc_stats[i].query_active);
+
+      for (int j = 0; j < ARRAY_SIZE(sctx->dcc_stats[i].ps_stats); j++)
+         if (sctx->dcc_stats[i].ps_stats[j])
+            sctx->b.destroy_query(&sctx->b, sctx->dcc_stats[i].ps_stats[j]);
+
+      si_texture_reference(&sctx->dcc_stats[i].tex, NULL);
+   }
+
+   if (sctx->query_result_shader)
+      sctx->b.delete_compute_state(&sctx->b, sctx->query_result_shader);
+   if (sctx->sh_query_result_shader)
+      sctx->b.delete_compute_state(&sctx->b, sctx->sh_query_result_shader);
+
+   if (sctx->gfx_cs)
+      sctx->ws->cs_destroy(sctx->gfx_cs);
+   if (sctx->sdma_cs)
+      sctx->ws->cs_destroy(sctx->sdma_cs);
+   if (sctx->ctx)
+      sctx->ws->ctx_destroy(sctx->ctx);
+
+   if (sctx->b.stream_uploader)
+      u_upload_destroy(sctx->b.stream_uploader);
+   if (sctx->b.const_uploader)
+      u_upload_destroy(sctx->b.const_uploader);
+   if (sctx->cached_gtt_allocator)
+      u_upload_destroy(sctx->cached_gtt_allocator);
+
+   slab_destroy_child(&sctx->pool_transfers);
+   slab_destroy_child(&sctx->pool_transfers_unsync);
+
+   if (sctx->allocator_zeroed_memory)
+      u_suballocator_destroy(sctx->allocator_zeroed_memory);
+
+   sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL);
+   sctx->ws->fence_reference(&sctx->last_sdma_fence, NULL);
+   sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL);
+   si_resource_reference(&sctx->eop_bug_scratch, NULL);
+   si_resource_reference(&sctx->index_ring, NULL);
+   si_resource_reference(&sctx->barrier_buf, NULL);
+   si_resource_reference(&sctx->last_ib_barrier_buf, NULL);
+   pb_reference(&sctx->gds, NULL);
+   pb_reference(&sctx->gds_oa, NULL);
+
+   si_destroy_compiler(&sctx->compiler);
+
+   si_saved_cs_reference(&sctx->current_saved_cs, NULL);
+
+   _mesa_hash_table_destroy(sctx->tex_handles, NULL);
+   _mesa_hash_table_destroy(sctx->img_handles, NULL);
+
+   util_dynarray_fini(&sctx->resident_tex_handles);
+   util_dynarray_fini(&sctx->resident_img_handles);
+   util_dynarray_fini(&sctx->resident_tex_needs_color_decompress);
+   util_dynarray_fini(&sctx->resident_img_needs_color_decompress);
+   util_dynarray_fini(&sctx->resident_tex_needs_depth_decompress);
+   si_unref_sdma_uploads(sctx);
+   free(sctx->sdma_uploads);
+   FREE(sctx);
 }
 
 static enum pipe_reset_status si_get_reset_status(struct pipe_context *ctx)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_screen *sscreen = sctx->screen;
-       enum pipe_reset_status status = sctx->ws->ctx_query_reset_status(sctx->ctx);
-
-       if (status != PIPE_NO_RESET) {
-               /* Call the state tracker to set a no-op API dispatch. */
-               if (sctx->device_reset_callback.reset) {
-                       sctx->device_reset_callback.reset(sctx->device_reset_callback.data,
-                                                         status);
-               }
-
-               /* Re-create the auxiliary context, because it won't submit
-                * any new IBs due to a GPU reset.
-                */
-               simple_mtx_lock(&sscreen->aux_context_lock);
-
-               struct u_log_context *aux_log = ((struct si_context *)sscreen->aux_context)->log;
-               sscreen->aux_context->set_log_context(sscreen->aux_context, NULL);
-               sscreen->aux_context->destroy(sscreen->aux_context);
-
-               sscreen->aux_context = si_create_context(&sscreen->b,
-                       (sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0) |
-                       (sscreen->info.has_graphics ? 0 : PIPE_CONTEXT_COMPUTE_ONLY));
-               sscreen->aux_context->set_log_context(sscreen->aux_context, aux_log);
-               simple_mtx_unlock(&sscreen->aux_context_lock);
-       }
-       return status;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_screen *sscreen = sctx->screen;
+   enum pipe_reset_status status = sctx->ws->ctx_query_reset_status(sctx->ctx);
+
+   if (status != PIPE_NO_RESET) {
+      /* Call the state tracker to set a no-op API dispatch. */
+      if (sctx->device_reset_callback.reset) {
+         sctx->device_reset_callback.reset(sctx->device_reset_callback.data, status);
+      }
+
+      /* Re-create the auxiliary context, because it won't submit
+       * any new IBs due to a GPU reset.
+       */
+      simple_mtx_lock(&sscreen->aux_context_lock);
+
+      struct u_log_context *aux_log = ((struct si_context *)sscreen->aux_context)->log;
+      sscreen->aux_context->set_log_context(sscreen->aux_context, NULL);
+      sscreen->aux_context->destroy(sscreen->aux_context);
+
+      sscreen->aux_context = si_create_context(
+         &sscreen->b, (sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0) |
+                         (sscreen->info.has_graphics ? 0 : PIPE_CONTEXT_COMPUTE_ONLY));
+      sscreen->aux_context->set_log_context(sscreen->aux_context, aux_log);
+      simple_mtx_unlock(&sscreen->aux_context_lock);
+   }
+   return status;
 }
 
 static void si_set_device_reset_callback(struct pipe_context *ctx,
-                                          const struct pipe_device_reset_callback *cb)
+                                         const struct pipe_device_reset_callback *cb)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
+   struct si_context *sctx = (struct si_context *)ctx;
 
-       if (cb)
-               sctx->device_reset_callback = *cb;
-       else
-               memset(&sctx->device_reset_callback, 0,
-                      sizeof(sctx->device_reset_callback));
+   if (cb)
+      sctx->device_reset_callback = *cb;
+   else
+      memset(&sctx->device_reset_callback, 0, sizeof(sctx->device_reset_callback));
 }
 
 /* Apitrace profiling:
@@ -366,989 +361,895 @@ static void si_set_device_reset_callback(struct pipe_context *ctx,
  *      call and print the results.
  *   4) glretrace --benchmark --markers ..
  */
-static void si_emit_string_marker(struct pipe_context *ctx,
-                                 const char *string, int len)
+static void si_emit_string_marker(struct pipe_context *ctx, const char *string, int len)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
+   struct si_context *sctx = (struct si_context *)ctx;
 
-       dd_parse_apitrace_marker(string, len, &sctx->apitrace_call_number);
+   dd_parse_apitrace_marker(string, len, &sctx->apitrace_call_number);
 
-       if (sctx->log)
-               u_log_printf(sctx->log, "\nString marker: %*s\n", len, string);
+   if (sctx->log)
+      u_log_printf(sctx->log, "\nString marker: %*s\n", len, string);
 }
 
-static void si_set_debug_callback(struct pipe_context *ctx,
-                                 const struct pipe_debug_callback *cb)
+static void si_set_debug_callback(struct pipe_context *ctx, const struct pipe_debug_callback *cb)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_screen *screen = sctx->screen;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_screen *screen = sctx->screen;
 
-       util_queue_finish(&screen->shader_compiler_queue);
-       util_queue_finish(&screen->shader_compiler_queue_low_priority);
+   util_queue_finish(&screen->shader_compiler_queue);
+   util_queue_finish(&screen->shader_compiler_queue_low_priority);
 
-       if (cb)
-               sctx->debug = *cb;
-       else
-               memset(&sctx->debug, 0, sizeof(sctx->debug));
+   if (cb)
+      sctx->debug = *cb;
+   else
+      memset(&sctx->debug, 0, sizeof(sctx->debug));
 }
 
-static void si_set_log_context(struct pipe_context *ctx,
-                              struct u_log_context *log)
+static void si_set_log_context(struct pipe_context *ctx, struct u_log_context *log)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       sctx->log = log;
+   struct si_context *sctx = (struct si_context *)ctx;
+   sctx->log = log;
 
-       if (log)
-               u_log_add_auto_logger(log, si_auto_log_cs, sctx);
+   if (log)
+      u_log_add_auto_logger(log, si_auto_log_cs, sctx);
 }
 
-static void si_set_context_param(struct pipe_context *ctx,
-                                enum pipe_context_param param,
-                                unsigned value)
+static void si_set_context_param(struct pipe_context *ctx, enum pipe_context_param param,
+                                 unsigned value)
 {
-       struct radeon_winsys *ws = ((struct si_context *)ctx)->ws;
-
-       switch (param) {
-       case PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE:
-               ws->pin_threads_to_L3_cache(ws, value);
-               break;
-       default:;
-       }
+   struct radeon_winsys *ws = ((struct si_context *)ctx)->ws;
+
+   switch (param) {
+   case PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE:
+      ws->pin_threads_to_L3_cache(ws, value);
+      break;
+   default:;
+   }
 }
 
-static struct pipe_context *si_create_context(struct pipe_screen *screen,
-                                              unsigned flags)
+static struct pipe_context *si_create_context(struct pipe_screen *screen, unsigned flags)
 {
-       struct si_screen* sscreen = (struct si_screen *)screen;
-       STATIC_ASSERT(DBG_COUNT <= 64);
-
-       /* Don't create a context if it's not compute-only and hw is compute-only. */
-       if (!sscreen->info.has_graphics &&
-           !(flags & PIPE_CONTEXT_COMPUTE_ONLY))
-               return NULL;
-
-       struct si_context *sctx = CALLOC_STRUCT(si_context);
-       struct radeon_winsys *ws = sscreen->ws;
-       int shader, i;
-       bool stop_exec_on_failure = (flags & PIPE_CONTEXT_LOSE_CONTEXT_ON_RESET) != 0;
-
-       if (!sctx)
-               return NULL;
-
-       sctx->has_graphics = sscreen->info.chip_class == GFX6 ||
-                            !(flags & PIPE_CONTEXT_COMPUTE_ONLY);
-
-       if (flags & PIPE_CONTEXT_DEBUG)
-               sscreen->record_llvm_ir = true; /* racy but not critical */
-
-       sctx->b.screen = screen; /* this must be set first */
-       sctx->b.priv = NULL;
-       sctx->b.destroy = si_destroy_context;
-       sctx->screen = sscreen; /* Easy accessing of screen/winsys. */
-       sctx->is_debug = (flags & PIPE_CONTEXT_DEBUG) != 0;
-
-       slab_create_child(&sctx->pool_transfers, &sscreen->pool_transfers);
-       slab_create_child(&sctx->pool_transfers_unsync, &sscreen->pool_transfers);
-
-       sctx->ws = sscreen->ws;
-       sctx->family = sscreen->info.family;
-       sctx->chip_class = sscreen->info.chip_class;
-
-       if (sctx->chip_class == GFX7 ||
-           sctx->chip_class == GFX8 ||
-           sctx->chip_class == GFX9) {
-               sctx->eop_bug_scratch = si_resource(
-                       pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT,
-                                          16 * sscreen->info.num_render_backends));
-               if (!sctx->eop_bug_scratch)
-                       goto fail;
-       }
-
-       /* Initialize context allocators. */
-       sctx->allocator_zeroed_memory =
-               u_suballocator_create(&sctx->b, 128 * 1024,
-                                     0, PIPE_USAGE_DEFAULT,
-                                     SI_RESOURCE_FLAG_UNMAPPABLE |
-                                     SI_RESOURCE_FLAG_CLEAR, false);
-       if (!sctx->allocator_zeroed_memory)
-               goto fail;
-
-       sctx->b.stream_uploader = u_upload_create(&sctx->b, 1024 * 1024,
-                                                   0, PIPE_USAGE_STREAM,
-                                                   SI_RESOURCE_FLAG_READ_ONLY);
-       if (!sctx->b.stream_uploader)
-               goto fail;
-
-       sctx->cached_gtt_allocator = u_upload_create(&sctx->b, 16 * 1024,
-                                                      0, PIPE_USAGE_STAGING, 0);
-       if (!sctx->cached_gtt_allocator)
-               goto fail;
-
-       sctx->ctx = sctx->ws->ctx_create(sctx->ws);
-       if (!sctx->ctx)
-               goto fail;
-
-       if (sscreen->info.num_rings[RING_DMA] &&
-           !(sscreen->debug_flags & DBG(NO_SDMA)) &&
-           /* SDMA causes corruption on RX 580:
-            *    https://gitlab.freedesktop.org/mesa/mesa/issues/1399
-            *    https://gitlab.freedesktop.org/mesa/mesa/issues/1889
-            */
-           (sctx->chip_class != GFX8 || sscreen->debug_flags & DBG(FORCE_SDMA)) &&
-           /* SDMA timeouts sometimes on gfx10 so disable it for now. See:
-            *    https://bugs.freedesktop.org/show_bug.cgi?id=111481
-            *    https://gitlab.freedesktop.org/mesa/mesa/issues/1907
-            */
-           (sctx->chip_class != GFX10 || sscreen->debug_flags & DBG(FORCE_SDMA))) {
-               sctx->sdma_cs = sctx->ws->cs_create(sctx->ctx, RING_DMA,
-                                                  (void*)si_flush_dma_cs,
-                                                  sctx, stop_exec_on_failure);
-       }
-
-       bool use_sdma_upload = sscreen->info.has_dedicated_vram && sctx->sdma_cs;
-       sctx->b.const_uploader = u_upload_create(&sctx->b, 256 * 1024,
-                                                0, PIPE_USAGE_DEFAULT,
-                                                SI_RESOURCE_FLAG_32BIT |
-                                                (use_sdma_upload ?
-                                                         SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA : 0));
-       if (!sctx->b.const_uploader)
-               goto fail;
-
-       if (use_sdma_upload)
-               u_upload_enable_flush_explicit(sctx->b.const_uploader);
-
-       sctx->gfx_cs = ws->cs_create(sctx->ctx,
-                                    sctx->has_graphics ? RING_GFX : RING_COMPUTE,
-                                    (void*)si_flush_gfx_cs, sctx, stop_exec_on_failure);
-
-       /* Border colors. */
-       sctx->border_color_table = malloc(SI_MAX_BORDER_COLORS *
-                                         sizeof(*sctx->border_color_table));
-       if (!sctx->border_color_table)
-               goto fail;
-
-       sctx->border_color_buffer = si_resource(
-               pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT,
-                                  SI_MAX_BORDER_COLORS *
-                                  sizeof(*sctx->border_color_table)));
-       if (!sctx->border_color_buffer)
-               goto fail;
-
-       sctx->border_color_map =
-               ws->buffer_map(sctx->border_color_buffer->buf,
-                              NULL, PIPE_TRANSFER_WRITE);
-       if (!sctx->border_color_map)
-               goto fail;
-
-       sctx->ngg = sscreen->use_ngg;
-
-       /* Initialize context functions used by graphics and compute. */
-       if (sctx->chip_class >= GFX10)
-               sctx->emit_cache_flush = gfx10_emit_cache_flush;
-       else
-               sctx->emit_cache_flush = si_emit_cache_flush;
-
-       sctx->b.emit_string_marker = si_emit_string_marker;
-       sctx->b.set_debug_callback = si_set_debug_callback;
-       sctx->b.set_log_context = si_set_log_context;
-       sctx->b.set_context_param = si_set_context_param;
-       sctx->b.get_device_reset_status = si_get_reset_status;
-       sctx->b.set_device_reset_callback = si_set_device_reset_callback;
-
-       si_init_all_descriptors(sctx);
-       si_init_buffer_functions(sctx);
-       si_init_clear_functions(sctx);
-       si_init_blit_functions(sctx);
-       si_init_compute_functions(sctx);
-       si_init_compute_blit_functions(sctx);
-       si_init_debug_functions(sctx);
-       si_init_fence_functions(sctx);
-       si_init_query_functions(sctx);
-       si_init_state_compute_functions(sctx);
-       si_init_context_texture_functions(sctx);
-
-       /* Initialize graphics-only context functions. */
-       if (sctx->has_graphics) {
-               if (sctx->chip_class >= GFX10)
-                       gfx10_init_query(sctx);
-               si_init_msaa_functions(sctx);
-               si_init_shader_functions(sctx);
-               si_init_state_functions(sctx);
-               si_init_streamout_functions(sctx);
-               si_init_viewport_functions(sctx);
-
-               sctx->blitter = util_blitter_create(&sctx->b);
-               if (sctx->blitter == NULL)
-                       goto fail;
-               sctx->blitter->skip_viewport_restore = true;
-
-               /* Some states are expected to be always non-NULL. */
-               sctx->noop_blend = util_blitter_get_noop_blend_state(sctx->blitter);
-               sctx->queued.named.blend = sctx->noop_blend;
-
-               sctx->noop_dsa = util_blitter_get_noop_dsa_state(sctx->blitter);
-               sctx->queued.named.dsa = sctx->noop_dsa;
-
-               sctx->discard_rasterizer_state =
-                       util_blitter_get_discard_rasterizer_state(sctx->blitter);
-               sctx->queued.named.rasterizer = sctx->discard_rasterizer_state;
-
-               si_init_draw_functions(sctx);
-
-               /* If aux_context == NULL, we are initializing aux_context right now. */
-               bool is_aux_context = !sscreen->aux_context;
-               si_initialize_prim_discard_tunables(sscreen, is_aux_context,
-                                                   &sctx->prim_discard_vertex_count_threshold,
-                                                   &sctx->index_ring_size_per_ib);
-       }
-
-       /* Initialize SDMA functions. */
-       if (sctx->chip_class >= GFX7)
-               cik_init_sdma_functions(sctx);
-       else
-               sctx->dma_copy = si_resource_copy_region;
-
-       if (sscreen->debug_flags & DBG(FORCE_SDMA))
-               sctx->b.resource_copy_region = sctx->dma_copy;
-
-       sctx->sample_mask = 0xffff;
-
-       /* Initialize multimedia functions. */
-       if (sscreen->info.has_hw_decode) {
-               sctx->b.create_video_codec = si_uvd_create_decoder;
-               sctx->b.create_video_buffer = si_video_buffer_create;
-       } else {
-               sctx->b.create_video_codec = vl_create_decoder;
-               sctx->b.create_video_buffer = vl_video_buffer_create;
-       }
-
-       if (sctx->chip_class >= GFX9 ||
-           si_compute_prim_discard_enabled(sctx)) {
-               sctx->wait_mem_scratch = si_resource(
-                       pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 8));
-               if (!sctx->wait_mem_scratch)
-                       goto fail;
-
-               /* Initialize the memory. */
-               si_cp_write_data(sctx, sctx->wait_mem_scratch, 0, 4,
-                                V_370_MEM, V_370_ME, &sctx->wait_mem_number);
-       }
-
-       /* GFX7 cannot unbind a constant buffer (S_BUFFER_LOAD doesn't skip loads
-        * if NUM_RECORDS == 0). We need to use a dummy buffer instead. */
-       if (sctx->chip_class == GFX7) {
-               sctx->null_const_buf.buffer =
-                       pipe_aligned_buffer_create(screen,
-                                                  SI_RESOURCE_FLAG_32BIT,
-                                                  PIPE_USAGE_DEFAULT, 16,
-                                                  sctx->screen->info.tcc_cache_line_size);
-               if (!sctx->null_const_buf.buffer)
-                       goto fail;
-               sctx->null_const_buf.buffer_size = sctx->null_const_buf.buffer->width0;
-
-               unsigned start_shader = sctx->has_graphics ? 0 :  PIPE_SHADER_COMPUTE;
-               for (shader = start_shader; shader < SI_NUM_SHADERS; shader++) {
-                       for (i = 0; i < SI_NUM_CONST_BUFFERS; i++) {
-                               sctx->b.set_constant_buffer(&sctx->b, shader, i,
-                                                             &sctx->null_const_buf);
-                       }
-               }
-
-               si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS,
-                                &sctx->null_const_buf);
-               si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS,
-                                &sctx->null_const_buf);
-               si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES,
-                                &sctx->null_const_buf);
-               si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE,
-                                &sctx->null_const_buf);
-               si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS,
-                                &sctx->null_const_buf);
-       }
-
-       uint64_t max_threads_per_block;
-       screen->get_compute_param(screen, PIPE_SHADER_IR_NIR,
-                                 PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK,
-                                 &max_threads_per_block);
-
-       /* The maximum number of scratch waves. Scratch space isn't divided
-        * evenly between CUs. The number is only a function of the number of CUs.
-        * We can decrease the constant to decrease the scratch buffer size.
-        *
-        * sctx->scratch_waves must be >= the maximum possible size of
-        * 1 threadgroup, so that the hw doesn't hang from being unable
-        * to start any.
-        *
-        * The recommended value is 4 per CU at most. Higher numbers don't
-        * bring much benefit, but they still occupy chip resources (think
-        * async compute). I've seen ~2% performance difference between 4 and 32.
-        */
-       sctx->scratch_waves = MAX2(32 * sscreen->info.num_good_compute_units,
-                                  max_threads_per_block / 64);
-
-       /* Bindless handles. */
-       sctx->tex_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
-                                                   _mesa_key_pointer_equal);
-       sctx->img_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
-                                                   _mesa_key_pointer_equal);
-
-       util_dynarray_init(&sctx->resident_tex_handles, NULL);
-       util_dynarray_init(&sctx->resident_img_handles, NULL);
-       util_dynarray_init(&sctx->resident_tex_needs_color_decompress, NULL);
-       util_dynarray_init(&sctx->resident_img_needs_color_decompress, NULL);
-       util_dynarray_init(&sctx->resident_tex_needs_depth_decompress, NULL);
-
-       sctx->sample_pos_buffer =
-               pipe_buffer_create(sctx->b.screen, 0, PIPE_USAGE_DEFAULT,
-                                  sizeof(sctx->sample_positions));
-       pipe_buffer_write(&sctx->b, sctx->sample_pos_buffer, 0,
-                         sizeof(sctx->sample_positions), &sctx->sample_positions);
-
-       /* this must be last */
-       si_begin_new_gfx_cs(sctx);
-
-       if (sctx->chip_class == GFX7) {
-               /* Clear the NULL constant buffer, because loads should return zeros.
-                * Note that this forces CP DMA to be used, because clover deadlocks
-                * for some reason when the compute codepath is used.
-                */
-               uint32_t clear_value = 0;
-               si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0,
-                               sctx->null_const_buf.buffer->width0,
-                               &clear_value, 4, SI_COHERENCY_SHADER, true);
-       }
-       return &sctx->b;
+   struct si_screen *sscreen = (struct si_screen *)screen;
+   STATIC_ASSERT(DBG_COUNT <= 64);
+
+   /* Don't create a context if it's not compute-only and hw is compute-only. */
+   if (!sscreen->info.has_graphics && !(flags & PIPE_CONTEXT_COMPUTE_ONLY))
+      return NULL;
+
+   struct si_context *sctx = CALLOC_STRUCT(si_context);
+   struct radeon_winsys *ws = sscreen->ws;
+   int shader, i;
+   bool stop_exec_on_failure = (flags & PIPE_CONTEXT_LOSE_CONTEXT_ON_RESET) != 0;
+
+   if (!sctx)
+      return NULL;
+
+   sctx->has_graphics = sscreen->info.chip_class == GFX6 || !(flags & PIPE_CONTEXT_COMPUTE_ONLY);
+
+   if (flags & PIPE_CONTEXT_DEBUG)
+      sscreen->record_llvm_ir = true; /* racy but not critical */
+
+   sctx->b.screen = screen; /* this must be set first */
+   sctx->b.priv = NULL;
+   sctx->b.destroy = si_destroy_context;
+   sctx->screen = sscreen; /* Easy accessing of screen/winsys. */
+   sctx->is_debug = (flags & PIPE_CONTEXT_DEBUG) != 0;
+
+   slab_create_child(&sctx->pool_transfers, &sscreen->pool_transfers);
+   slab_create_child(&sctx->pool_transfers_unsync, &sscreen->pool_transfers);
+
+   sctx->ws = sscreen->ws;
+   sctx->family = sscreen->info.family;
+   sctx->chip_class = sscreen->info.chip_class;
+
+   if (sctx->chip_class == GFX7 || sctx->chip_class == GFX8 || sctx->chip_class == GFX9) {
+      sctx->eop_bug_scratch = si_resource(pipe_buffer_create(
+         &sscreen->b, 0, PIPE_USAGE_DEFAULT, 16 * sscreen->info.num_render_backends));
+      if (!sctx->eop_bug_scratch)
+         goto fail;
+   }
+
+   /* Initialize context allocators. */
+   sctx->allocator_zeroed_memory =
+      u_suballocator_create(&sctx->b, 128 * 1024, 0, PIPE_USAGE_DEFAULT,
+                            SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_CLEAR, false);
+   if (!sctx->allocator_zeroed_memory)
+      goto fail;
+
+   sctx->b.stream_uploader =
+      u_upload_create(&sctx->b, 1024 * 1024, 0, PIPE_USAGE_STREAM, SI_RESOURCE_FLAG_READ_ONLY);
+   if (!sctx->b.stream_uploader)
+      goto fail;
+
+   sctx->cached_gtt_allocator = u_upload_create(&sctx->b, 16 * 1024, 0, PIPE_USAGE_STAGING, 0);
+   if (!sctx->cached_gtt_allocator)
+      goto fail;
+
+   sctx->ctx = sctx->ws->ctx_create(sctx->ws);
+   if (!sctx->ctx)
+      goto fail;
+
+   if (sscreen->info.num_rings[RING_DMA] && !(sscreen->debug_flags & DBG(NO_SDMA)) &&
+       /* SDMA causes corruption on RX 580:
+        *    https://gitlab.freedesktop.org/mesa/mesa/issues/1399
+        *    https://gitlab.freedesktop.org/mesa/mesa/issues/1889
+        */
+       (sctx->chip_class != GFX8 || sscreen->debug_flags & DBG(FORCE_SDMA)) &&
+       /* SDMA sometimes times out on gfx10, so disable it for now. See:
+        *    https://bugs.freedesktop.org/show_bug.cgi?id=111481
+        *    https://gitlab.freedesktop.org/mesa/mesa/issues/1907
+        */
+       (sctx->chip_class != GFX10 || sscreen->debug_flags & DBG(FORCE_SDMA))) {
+      sctx->sdma_cs = sctx->ws->cs_create(sctx->ctx, RING_DMA, (void *)si_flush_dma_cs, sctx,
+                                          stop_exec_on_failure);
+   }
+
+   bool use_sdma_upload = sscreen->info.has_dedicated_vram && sctx->sdma_cs;
+   sctx->b.const_uploader =
+      u_upload_create(&sctx->b, 256 * 1024, 0, PIPE_USAGE_DEFAULT,
+                      SI_RESOURCE_FLAG_32BIT |
+                         (use_sdma_upload ? SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA : 0));
+   if (!sctx->b.const_uploader)
+      goto fail;
+
+   if (use_sdma_upload)
+      u_upload_enable_flush_explicit(sctx->b.const_uploader);
+
+   sctx->gfx_cs = ws->cs_create(sctx->ctx, sctx->has_graphics ? RING_GFX : RING_COMPUTE,
+                                (void *)si_flush_gfx_cs, sctx, stop_exec_on_failure);
+
+   /* Border colors. */
+   sctx->border_color_table = malloc(SI_MAX_BORDER_COLORS * sizeof(*sctx->border_color_table));
+   if (!sctx->border_color_table)
+      goto fail;
+
+   sctx->border_color_buffer = si_resource(pipe_buffer_create(
+      screen, 0, PIPE_USAGE_DEFAULT, SI_MAX_BORDER_COLORS * sizeof(*sctx->border_color_table)));
+   if (!sctx->border_color_buffer)
+      goto fail;
+
+   sctx->border_color_map =
+      ws->buffer_map(sctx->border_color_buffer->buf, NULL, PIPE_TRANSFER_WRITE);
+   if (!sctx->border_color_map)
+      goto fail;
+
+   sctx->ngg = sscreen->use_ngg;
+
+   /* Initialize context functions used by graphics and compute. */
+   if (sctx->chip_class >= GFX10)
+      sctx->emit_cache_flush = gfx10_emit_cache_flush;
+   else
+      sctx->emit_cache_flush = si_emit_cache_flush;
+
+   sctx->b.emit_string_marker = si_emit_string_marker;
+   sctx->b.set_debug_callback = si_set_debug_callback;
+   sctx->b.set_log_context = si_set_log_context;
+   sctx->b.set_context_param = si_set_context_param;
+   sctx->b.get_device_reset_status = si_get_reset_status;
+   sctx->b.set_device_reset_callback = si_set_device_reset_callback;
+
+   si_init_all_descriptors(sctx);
+   si_init_buffer_functions(sctx);
+   si_init_clear_functions(sctx);
+   si_init_blit_functions(sctx);
+   si_init_compute_functions(sctx);
+   si_init_compute_blit_functions(sctx);
+   si_init_debug_functions(sctx);
+   si_init_fence_functions(sctx);
+   si_init_query_functions(sctx);
+   si_init_state_compute_functions(sctx);
+   si_init_context_texture_functions(sctx);
+
+   /* Initialize graphics-only context functions. */
+   if (sctx->has_graphics) {
+      if (sctx->chip_class >= GFX10)
+         gfx10_init_query(sctx);
+      si_init_msaa_functions(sctx);
+      si_init_shader_functions(sctx);
+      si_init_state_functions(sctx);
+      si_init_streamout_functions(sctx);
+      si_init_viewport_functions(sctx);
+
+      sctx->blitter = util_blitter_create(&sctx->b);
+      if (sctx->blitter == NULL)
+         goto fail;
+      sctx->blitter->skip_viewport_restore = true;
+
+      /* Some states are expected to be always non-NULL. */
+      sctx->noop_blend = util_blitter_get_noop_blend_state(sctx->blitter);
+      sctx->queued.named.blend = sctx->noop_blend;
+
+      sctx->noop_dsa = util_blitter_get_noop_dsa_state(sctx->blitter);
+      sctx->queued.named.dsa = sctx->noop_dsa;
+
+      sctx->discard_rasterizer_state = util_blitter_get_discard_rasterizer_state(sctx->blitter);
+      sctx->queued.named.rasterizer = sctx->discard_rasterizer_state;
+
+      si_init_draw_functions(sctx);
+
+      /* If aux_context == NULL, we are initializing aux_context right now. */
+      bool is_aux_context = !sscreen->aux_context;
+      si_initialize_prim_discard_tunables(sscreen, is_aux_context,
+                                          &sctx->prim_discard_vertex_count_threshold,
+                                          &sctx->index_ring_size_per_ib);
+   }
+
+   /* Initialize SDMA functions. */
+   if (sctx->chip_class >= GFX7)
+      cik_init_sdma_functions(sctx);
+   else
+      sctx->dma_copy = si_resource_copy_region;
+
+   if (sscreen->debug_flags & DBG(FORCE_SDMA))
+      sctx->b.resource_copy_region = sctx->dma_copy;
+
+   sctx->sample_mask = 0xffff;
+
+   /* Initialize multimedia functions. */
+   if (sscreen->info.has_hw_decode) {
+      sctx->b.create_video_codec = si_uvd_create_decoder;
+      sctx->b.create_video_buffer = si_video_buffer_create;
+   } else {
+      sctx->b.create_video_codec = vl_create_decoder;
+      sctx->b.create_video_buffer = vl_video_buffer_create;
+   }
+
+   if (sctx->chip_class >= GFX9 || si_compute_prim_discard_enabled(sctx)) {
+      sctx->wait_mem_scratch = si_resource(pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 8));
+      if (!sctx->wait_mem_scratch)
+         goto fail;
+
+      /* Initialize the memory. */
+      si_cp_write_data(sctx, sctx->wait_mem_scratch, 0, 4, V_370_MEM, V_370_ME,
+                       &sctx->wait_mem_number);
+   }
+
+   /* GFX7 cannot unbind a constant buffer (S_BUFFER_LOAD doesn't skip loads
+    * if NUM_RECORDS == 0). We need to use a dummy buffer instead. */
+   if (sctx->chip_class == GFX7) {
+      sctx->null_const_buf.buffer =
+         pipe_aligned_buffer_create(screen, SI_RESOURCE_FLAG_32BIT, PIPE_USAGE_DEFAULT, 16,
+                                    sctx->screen->info.tcc_cache_line_size);
+      if (!sctx->null_const_buf.buffer)
+         goto fail;
+      sctx->null_const_buf.buffer_size = sctx->null_const_buf.buffer->width0;
+
+      unsigned start_shader = sctx->has_graphics ? 0 : PIPE_SHADER_COMPUTE;
+      for (shader = start_shader; shader < SI_NUM_SHADERS; shader++) {
+         for (i = 0; i < SI_NUM_CONST_BUFFERS; i++) {
+            sctx->b.set_constant_buffer(&sctx->b, shader, i, &sctx->null_const_buf);
+         }
+      }
+
+      si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &sctx->null_const_buf);
+      si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &sctx->null_const_buf);
+      si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES, &sctx->null_const_buf);
+      si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE, &sctx->null_const_buf);
+      si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &sctx->null_const_buf);
+   }
+
+   uint64_t max_threads_per_block;
+   screen->get_compute_param(screen, PIPE_SHADER_IR_NIR, PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK,
+                             &max_threads_per_block);
+
+   /* The maximum number of scratch waves. Scratch space isn't divided
+    * evenly between CUs. The number is only a function of the number of CUs.
+    * We can decrease the constant to decrease the scratch buffer size.
+    *
+    * sctx->scratch_waves must be >= the maximum possible size of
+    * 1 threadgroup, so that the hw doesn't hang from being unable
+    * to start any.
+    *
+    * The recommended value is 4 per CU at most. Higher numbers don't
+    * bring much benefit, but they still occupy chip resources (think
+    * async compute). I've seen ~2% performance difference between 4 and 32.
+    */
+   sctx->scratch_waves =
+      MAX2(32 * sscreen->info.num_good_compute_units, max_threads_per_block / 64);
+
+   /* Bindless handles. */
+   sctx->tex_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
+   sctx->img_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
+
+   util_dynarray_init(&sctx->resident_tex_handles, NULL);
+   util_dynarray_init(&sctx->resident_img_handles, NULL);
+   util_dynarray_init(&sctx->resident_tex_needs_color_decompress, NULL);
+   util_dynarray_init(&sctx->resident_img_needs_color_decompress, NULL);
+   util_dynarray_init(&sctx->resident_tex_needs_depth_decompress, NULL);
+
+   sctx->sample_pos_buffer =
+      pipe_buffer_create(sctx->b.screen, 0, PIPE_USAGE_DEFAULT, sizeof(sctx->sample_positions));
+   pipe_buffer_write(&sctx->b, sctx->sample_pos_buffer, 0, sizeof(sctx->sample_positions),
+                     &sctx->sample_positions);
+
+   /* this must be last */
+   si_begin_new_gfx_cs(sctx);
+
+   if (sctx->chip_class == GFX7) {
+      /* Clear the NULL constant buffer, because loads should return zeros.
+       * Note that this forces CP DMA to be used, because clover deadlocks
+       * for some reason when the compute codepath is used.
+       */
+      uint32_t clear_value = 0;
+      si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0, sctx->null_const_buf.buffer->width0,
+                      &clear_value, 4, SI_COHERENCY_SHADER, true);
+   }
+   return &sctx->b;
 fail:
-       fprintf(stderr, "radeonsi: Failed to create a context.\n");
-       si_destroy_context(&sctx->b);
-       return NULL;
+   fprintf(stderr, "radeonsi: Failed to create a context.\n");
+   si_destroy_context(&sctx->b);
+   return NULL;
 }
 
-static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen,
-                                                  void *priv, unsigned flags)
+static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen, void *priv,
+                                                   unsigned flags)
 {
-       struct si_screen *sscreen = (struct si_screen *)screen;
-       struct pipe_context *ctx;
+   struct si_screen *sscreen = (struct si_screen *)screen;
+   struct pipe_context *ctx;
 
-       if (sscreen->debug_flags & DBG(CHECK_VM))
-               flags |= PIPE_CONTEXT_DEBUG;
+   if (sscreen->debug_flags & DBG(CHECK_VM))
+      flags |= PIPE_CONTEXT_DEBUG;
 
-       ctx = si_create_context(screen, flags);
+   ctx = si_create_context(screen, flags);
 
-       if (!(flags & PIPE_CONTEXT_PREFER_THREADED))
-               return ctx;
+   if (!(flags & PIPE_CONTEXT_PREFER_THREADED))
+      return ctx;
 
-       /* Clover (compute-only) is unsupported. */
-       if (flags & PIPE_CONTEXT_COMPUTE_ONLY)
-               return ctx;
+   /* Clover (compute-only) is unsupported. */
+   if (flags & PIPE_CONTEXT_COMPUTE_ONLY)
+      return ctx;
 
-       /* When shaders are logged to stderr, asynchronous compilation is
-        * disabled too. */
-       if (sscreen->debug_flags & DBG_ALL_SHADERS)
-               return ctx;
+   /* When shaders are logged to stderr, asynchronous compilation is
+    * disabled too. */
+   if (sscreen->debug_flags & DBG_ALL_SHADERS)
+      return ctx;
 
-       /* Use asynchronous flushes only on amdgpu, since the radeon
-        * implementation for fence_server_sync is incomplete. */
-       return threaded_context_create(ctx, &sscreen->pool_transfers,
-                                      si_replace_buffer_storage,
-                                      sscreen->info.is_amdgpu ? si_create_fence : NULL,
-                                      &((struct si_context*)ctx)->tc);
+   /* Use asynchronous flushes only on amdgpu, since the radeon
+    * implementation for fence_server_sync is incomplete. */
+   return threaded_context_create(ctx, &sscreen->pool_transfers, si_replace_buffer_storage,
+                                  sscreen->info.is_amdgpu ? si_create_fence : NULL,
+                                  &((struct si_context *)ctx)->tc);
 }
 
 /*
  * pipe_screen
  */
-static void si_destroy_screen(struct pipe_screen *pscreen)
+static void si_destroy_screen(struct pipe_screen *pscreen)
 {
-       struct si_screen *sscreen = (struct si_screen *)pscreen;
-       struct si_shader_part *parts[] = {
-               sscreen->vs_prologs,
-               sscreen->tcs_epilogs,
-               sscreen->gs_prologs,
-               sscreen->ps_prologs,
-               sscreen->ps_epilogs
-       };
-       unsigned i;
-
-       if (!sscreen->ws->unref(sscreen->ws))
-               return;
-
-       if (sscreen->debug_flags & DBG(CACHE_STATS)) {
-               printf("live shader cache:   hits = %u, misses = %u\n",
-                      sscreen->live_shader_cache.hits,
-                      sscreen->live_shader_cache.misses);
-               printf("memory shader cache: hits = %u, misses = %u\n",
-                      sscreen->num_memory_shader_cache_hits,
-                      sscreen->num_memory_shader_cache_misses);
-               printf("disk shader cache:   hits = %u, misses = %u\n",
-                      sscreen->num_disk_shader_cache_hits,
-                      sscreen->num_disk_shader_cache_misses);
-       }
-
-       simple_mtx_destroy(&sscreen->aux_context_lock);
-
-       struct u_log_context *aux_log = ((struct si_context *)sscreen->aux_context)->log;
-       if (aux_log) {
-               sscreen->aux_context->set_log_context(sscreen->aux_context, NULL);
-               u_log_context_destroy(aux_log);
-               FREE(aux_log);
-       }
-
-       sscreen->aux_context->destroy(sscreen->aux_context);
-
-       util_queue_destroy(&sscreen->shader_compiler_queue);
-       util_queue_destroy(&sscreen->shader_compiler_queue_low_priority);
-
-       /* Release the reference on glsl types of the compiler threads. */
-       glsl_type_singleton_decref();
-
-       for (i = 0; i < ARRAY_SIZE(sscreen->compiler); i++)
-               si_destroy_compiler(&sscreen->compiler[i]);
-
-       for (i = 0; i < ARRAY_SIZE(sscreen->compiler_lowp); i++)
-               si_destroy_compiler(&sscreen->compiler_lowp[i]);
-
-       /* Free shader parts. */
-       for (i = 0; i < ARRAY_SIZE(parts); i++) {
-               while (parts[i]) {
-                       struct si_shader_part *part = parts[i];
-
-                       parts[i] = part->next;
-                       si_shader_binary_clean(&part->binary);
-                       FREE(part);
-               }
-       }
-       simple_mtx_destroy(&sscreen->shader_parts_mutex);
-       si_destroy_shader_cache(sscreen);
-
-       si_destroy_perfcounters(sscreen);
-       si_gpu_load_kill_thread(sscreen);
-
-       simple_mtx_destroy(&sscreen->gpu_load_mutex);
-
-       slab_destroy_parent(&sscreen->pool_transfers);
-
-       disk_cache_destroy(sscreen->disk_shader_cache);
-       util_live_shader_cache_deinit(&sscreen->live_shader_cache);
-       sscreen->ws->destroy(sscreen->ws);
-       FREE(sscreen);
+   struct si_screen *sscreen = (struct si_screen *)pscreen;
+   struct si_shader_part *parts[] = {sscreen->vs_prologs, sscreen->tcs_epilogs, sscreen->gs_prologs,
+                                     sscreen->ps_prologs, sscreen->ps_epilogs};
+   unsigned i;
+
+   if (!sscreen->ws->unref(sscreen->ws))
+      return;
+
+   if (sscreen->debug_flags & DBG(CACHE_STATS)) {
+      printf("live shader cache:   hits = %u, misses = %u\n", sscreen->live_shader_cache.hits,
+             sscreen->live_shader_cache.misses);
+      printf("memory shader cache: hits = %u, misses = %u\n", sscreen->num_memory_shader_cache_hits,
+             sscreen->num_memory_shader_cache_misses);
+      printf("disk shader cache:   hits = %u, misses = %u\n", sscreen->num_disk_shader_cache_hits,
+             sscreen->num_disk_shader_cache_misses);
+   }
+
+   simple_mtx_destroy(&sscreen->aux_context_lock);
+
+   struct u_log_context *aux_log = ((struct si_context *)sscreen->aux_context)->log;
+   if (aux_log) {
+      sscreen->aux_context->set_log_context(sscreen->aux_context, NULL);
+      u_log_context_destroy(aux_log);
+      FREE(aux_log);
+   }
+
+   sscreen->aux_context->destroy(sscreen->aux_context);
+
+   util_queue_destroy(&sscreen->shader_compiler_queue);
+   util_queue_destroy(&sscreen->shader_compiler_queue_low_priority);
+
+   /* Release the reference on glsl types of the compiler threads. */
+   glsl_type_singleton_decref();
+
+   for (i = 0; i < ARRAY_SIZE(sscreen->compiler); i++)
+      si_destroy_compiler(&sscreen->compiler[i]);
+
+   for (i = 0; i < ARRAY_SIZE(sscreen->compiler_lowp); i++)
+      si_destroy_compiler(&sscreen->compiler_lowp[i]);
+
+   /* Free shader parts. */
+   for (i = 0; i < ARRAY_SIZE(parts); i++) {
+      while (parts[i]) {
+         struct si_shader_part *part = parts[i];
+
+         parts[i] = part->next;
+         si_shader_binary_clean(&part->binary);
+         FREE(part);
+      }
+   }
+   simple_mtx_destroy(&sscreen->shader_parts_mutex);
+   si_destroy_shader_cache(sscreen);
+
+   si_destroy_perfcounters(sscreen);
+   si_gpu_load_kill_thread(sscreen);
+
+   simple_mtx_destroy(&sscreen->gpu_load_mutex);
+
+   slab_destroy_parent(&sscreen->pool_transfers);
+
+   disk_cache_destroy(sscreen->disk_shader_cache);
+   util_live_shader_cache_deinit(&sscreen->live_shader_cache);
+   sscreen->ws->destroy(sscreen->ws);
+   FREE(sscreen);
 }
 
 static void si_init_gs_info(struct si_screen *sscreen)
 {
-       sscreen->gs_table_depth = ac_get_gs_table_depth(sscreen->info.chip_class,
-                                                       sscreen->info.family);
+   sscreen->gs_table_depth = ac_get_gs_table_depth(sscreen->info.chip_class, sscreen->info.family);
 }
 
 static void si_test_vmfault(struct si_screen *sscreen, uint64_t test_flags)
 {
-       struct pipe_context *ctx = sscreen->aux_context;
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct pipe_resource *buf =
-               pipe_buffer_create_const0(&sscreen->b, 0, PIPE_USAGE_DEFAULT, 64);
-
-       if (!buf) {
-               puts("Buffer allocation failed.");
-               exit(1);
-       }
-
-       si_resource(buf)->gpu_address = 0; /* cause a VM fault */
-
-       if (test_flags & DBG(TEST_VMFAULT_CP)) {
-               si_cp_dma_copy_buffer(sctx, buf, buf, 0, 4, 4, 0,
-                                     SI_COHERENCY_NONE, L2_BYPASS);
-               ctx->flush(ctx, NULL, 0);
-               puts("VM fault test: CP - done.");
-       }
-       if (test_flags & DBG(TEST_VMFAULT_SDMA)) {
-               si_sdma_clear_buffer(sctx, buf, 0, 4, 0);
-               ctx->flush(ctx, NULL, 0);
-               puts("VM fault test: SDMA - done.");
-       }
-       if (test_flags & DBG(TEST_VMFAULT_SHADER)) {
-               util_test_constant_buffer(ctx, buf);
-               puts("VM fault test: Shader - done.");
-       }
-       exit(0);
+   struct pipe_context *ctx = sscreen->aux_context;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct pipe_resource *buf = pipe_buffer_create_const0(&sscreen->b, 0, PIPE_USAGE_DEFAULT, 64);
+
+   if (!buf) {
+      puts("Buffer allocation failed.");
+      exit(1);
+   }
+
+   si_resource(buf)->gpu_address = 0; /* cause a VM fault */
+
+   if (test_flags & DBG(TEST_VMFAULT_CP)) {
+      si_cp_dma_copy_buffer(sctx, buf, buf, 0, 4, 4, 0, SI_COHERENCY_NONE, L2_BYPASS);
+      ctx->flush(ctx, NULL, 0);
+      puts("VM fault test: CP - done.");
+   }
+   if (test_flags & DBG(TEST_VMFAULT_SDMA)) {
+      si_sdma_clear_buffer(sctx, buf, 0, 4, 0);
+      ctx->flush(ctx, NULL, 0);
+      puts("VM fault test: SDMA - done.");
+   }
+   if (test_flags & DBG(TEST_VMFAULT_SHADER)) {
+      util_test_constant_buffer(ctx, buf);
+      puts("VM fault test: Shader - done.");
+   }
+   exit(0);
 }
 
-static void si_test_gds_memory_management(struct si_context *sctx,
-                                         unsigned alloc_size, unsigned alignment,
-                                         enum radeon_bo_domain domain)
+static void si_test_gds_memory_management(struct si_context *sctx, unsigned alloc_size,
+                                          unsigned alignment, enum radeon_bo_domain domain)
 {
-       struct radeon_winsys *ws = sctx->ws;
-       struct radeon_cmdbuf *cs[8];
-       struct pb_buffer *gds_bo[ARRAY_SIZE(cs)];
-
-       for (unsigned i = 0; i < ARRAY_SIZE(cs); i++) {
-               cs[i] = ws->cs_create(sctx->ctx, RING_COMPUTE,
-                                     NULL, NULL, false);
-               gds_bo[i] = ws->buffer_create(ws, alloc_size, alignment, domain, 0);
-               assert(gds_bo[i]);
-       }
-
-       for (unsigned iterations = 0; iterations < 20000; iterations++) {
-               for (unsigned i = 0; i < ARRAY_SIZE(cs); i++) {
-                       /* This clears GDS with CP DMA.
-                        *
-                        * We don't care if GDS is present. Just add some packet
-                        * to make the GPU busy for a moment.
-                        */
-                       si_cp_dma_clear_buffer(sctx, cs[i], NULL, 0, alloc_size, 0,
-                                              SI_CPDMA_SKIP_BO_LIST_UPDATE |
-                                              SI_CPDMA_SKIP_CHECK_CS_SPACE |
-                                              SI_CPDMA_SKIP_GFX_SYNC, 0, 0);
-
-                       ws->cs_add_buffer(cs[i], gds_bo[i], domain,
-                                         RADEON_USAGE_READWRITE, 0);
-                       ws->cs_flush(cs[i], PIPE_FLUSH_ASYNC, NULL);
-               }
-       }
-       exit(0);
+   struct radeon_winsys *ws = sctx->ws;
+   struct radeon_cmdbuf *cs[8];
+   struct pb_buffer *gds_bo[ARRAY_SIZE(cs)];
+
+   for (unsigned i = 0; i < ARRAY_SIZE(cs); i++) {
+      cs[i] = ws->cs_create(sctx->ctx, RING_COMPUTE, NULL, NULL, false);
+      gds_bo[i] = ws->buffer_create(ws, alloc_size, alignment, domain, 0);
+      assert(gds_bo[i]);
+   }
+
+   for (unsigned iterations = 0; iterations < 20000; iterations++) {
+      for (unsigned i = 0; i < ARRAY_SIZE(cs); i++) {
+         /* This clears GDS with CP DMA.
+          *
+          * We don't care if GDS is present. Just add some packet
+          * to make the GPU busy for a moment.
+          */
+         si_cp_dma_clear_buffer(
+            sctx, cs[i], NULL, 0, alloc_size, 0,
+            SI_CPDMA_SKIP_BO_LIST_UPDATE | SI_CPDMA_SKIP_CHECK_CS_SPACE | SI_CPDMA_SKIP_GFX_SYNC, 0,
+            0);
+
+         ws->cs_add_buffer(cs[i], gds_bo[i], domain, RADEON_USAGE_READWRITE, 0);
+         ws->cs_flush(cs[i], PIPE_FLUSH_ASYNC, NULL);
+      }
+   }
+   exit(0);
 }
 
 static void si_disk_cache_create(struct si_screen *sscreen)
 {
-       /* Don't use the cache if shader dumping is enabled. */
-       if (sscreen->debug_flags & DBG_ALL_SHADERS)
-               return;
-
-       struct mesa_sha1 ctx;
-       unsigned char sha1[20];
-       char cache_id[20 * 2 + 1];
-
-       _mesa_sha1_init(&ctx);
-
-       if (!disk_cache_get_function_identifier(si_disk_cache_create, &ctx) ||
-           !disk_cache_get_function_identifier(LLVMInitializeAMDGPUTargetInfo,
-                                               &ctx))
-               return;
-
-       _mesa_sha1_final(&ctx, sha1);
-       disk_cache_format_hex_id(cache_id, sha1, 20 * 2);
-
-       /* These flags affect shader compilation. */
-       #define ALL_FLAGS (DBG(GISEL))
-       uint64_t shader_debug_flags = sscreen->debug_flags & ALL_FLAGS;
-
-       /* Add the high bits of 32-bit addresses, which affects
-        * how 32-bit addresses are expanded to 64 bits.
-        */
-       STATIC_ASSERT(ALL_FLAGS <= UINT_MAX);
-       assert((int16_t)sscreen->info.address32_hi == (int32_t)sscreen->info.address32_hi);
-       shader_debug_flags |= (uint64_t)(sscreen->info.address32_hi & 0xffff) << 32;
-
-       sscreen->disk_shader_cache =
-               disk_cache_create(sscreen->info.name,
-                                 cache_id,
-                                 shader_debug_flags);
+   /* Don't use the cache if shader dumping is enabled. */
+   if (sscreen->debug_flags & DBG_ALL_SHADERS)
+      return;
+
+   struct mesa_sha1 ctx;
+   unsigned char sha1[20];
+   char cache_id[20 * 2 + 1];
+
+   _mesa_sha1_init(&ctx);
+
+   if (!disk_cache_get_function_identifier(si_disk_cache_create, &ctx) ||
+       !disk_cache_get_function_identifier(LLVMInitializeAMDGPUTargetInfo, &ctx))
+      return;
+
+   _mesa_sha1_final(&ctx, sha1);
+   disk_cache_format_hex_id(cache_id, sha1, 20 * 2);
+
+/* These flags affect shader compilation. */
+#define ALL_FLAGS (DBG(GISEL))
+   uint64_t shader_debug_flags = sscreen->debug_flags & ALL_FLAGS;
+
+   /* Add the high bits of 32-bit addresses, which affects
+    * how 32-bit addresses are expanded to 64 bits.
+    */
+   STATIC_ASSERT(ALL_FLAGS <= UINT_MAX);
+   assert((int16_t)sscreen->info.address32_hi == (int32_t)sscreen->info.address32_hi);
+   shader_debug_flags |= (uint64_t)(sscreen->info.address32_hi & 0xffff) << 32;
+
+   sscreen->disk_shader_cache = disk_cache_create(sscreen->info.name, cache_id, shader_debug_flags);
 }
 
-static void si_set_max_shader_compiler_threads(struct pipe_screen *screen,
-                                              unsigned max_threads)
+static void si_set_max_shader_compiler_threads(struct pipe_screen *screen, unsigned max_threads)
 {
-       struct si_screen *sscreen = (struct si_screen *)screen;
+   struct si_screen *sscreen = (struct si_screen *)screen;
 
-       /* This function doesn't allow a greater number of threads than
-        * the queue had at its creation. */
-       util_queue_adjust_num_threads(&sscreen->shader_compiler_queue,
-                                     max_threads);
-       /* Don't change the number of threads on the low priority queue. */
+   /* This function doesn't allow a greater number of threads than
+    * the queue had at its creation. */
+   util_queue_adjust_num_threads(&sscreen->shader_compiler_queue, max_threads);
+   /* Don't change the number of threads on the low priority queue. */
 }
 
-static bool si_is_parallel_shader_compilation_finished(struct pipe_screen *screen,
-                                                      void *shader,
-                                                      enum pipe_shader_type shader_type)
+static bool si_is_parallel_shader_compilation_finished(struct pipe_screen *screen, void *shader,
+                                                       enum pipe_shader_type shader_type)
 {
-       struct si_shader_selector *sel = (struct si_shader_selector *)shader;
+   struct si_shader_selector *sel = (struct si_shader_selector *)shader;
 
-       return util_queue_fence_is_signalled(&sel->ready);
+   return util_queue_fence_is_signalled(&sel->ready);
 }
 
-static struct pipe_screen *
-radeonsi_screen_create_impl(struct radeon_winsys *ws,
-                           const struct pipe_screen_config *config)
+static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
+                                                       const struct pipe_screen_config *config)
 {
-       struct si_screen *sscreen = CALLOC_STRUCT(si_screen);
-       unsigned hw_threads, num_comp_hi_threads, num_comp_lo_threads;
-       uint64_t test_flags;
-
-       if (!sscreen) {
-               return NULL;
-       }
-
-       sscreen->ws = ws;
-       ws->query_info(ws, &sscreen->info);
-
-       if (sscreen->info.chip_class == GFX10 && LLVM_VERSION_MAJOR < 9) {
-               fprintf(stderr, "radeonsi: Navi family support requires LLVM 9 or higher\n");
-               FREE(sscreen);
-               return NULL;
-       }
-
-       if (sscreen->info.chip_class >= GFX9) {
-               sscreen->se_tile_repeat = 32 * sscreen->info.max_se;
-       } else {
-               ac_get_raster_config(&sscreen->info,
-                                    &sscreen->pa_sc_raster_config,
-                                    &sscreen->pa_sc_raster_config_1,
-                                    &sscreen->se_tile_repeat);
-       }
-
-       sscreen->debug_flags = debug_get_flags_option("R600_DEBUG",
-                                                     debug_options, 0);
-       sscreen->debug_flags |= debug_get_flags_option("AMD_DEBUG",
-                                                      debug_options, 0);
-       test_flags = debug_get_flags_option("AMD_TEST",
-                                           test_options, 0);
-
-       if (sscreen->debug_flags & DBG(NO_GFX))
-               sscreen->info.has_graphics = false;
-
-       /* Set functions first. */
-       sscreen->b.context_create = si_pipe_create_context;
-       sscreen->b.destroy = si_destroy_screen;
-       sscreen->b.set_max_shader_compiler_threads =
-               si_set_max_shader_compiler_threads;
-       sscreen->b.is_parallel_shader_compilation_finished =
-               si_is_parallel_shader_compilation_finished;
-       sscreen->b.finalize_nir = si_finalize_nir;
-
-       si_init_screen_get_functions(sscreen);
-       si_init_screen_buffer_functions(sscreen);
-       si_init_screen_fence_functions(sscreen);
-       si_init_screen_state_functions(sscreen);
-       si_init_screen_texture_functions(sscreen);
-       si_init_screen_query_functions(sscreen);
-       si_init_screen_live_shader_cache(sscreen);
-
-       /* Set these flags in debug_flags early, so that the shader cache takes
-        * them into account.
-        */
-       if (driQueryOptionb(config->options,
-                           "glsl_correct_derivatives_after_discard"))
-               sscreen->debug_flags |= DBG(FS_CORRECT_DERIVS_AFTER_KILL);
-
-       if (sscreen->debug_flags & DBG(INFO))
-               ac_print_gpu_info(&sscreen->info);
-
-       slab_create_parent(&sscreen->pool_transfers,
-                          sizeof(struct si_transfer), 64);
-
-       sscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1));
-       if (sscreen->force_aniso == -1) {
-               sscreen->force_aniso = MIN2(16, debug_get_num_option("AMD_TEX_ANISO", -1));
-       }
-
-       if (sscreen->force_aniso >= 0) {
-               printf("radeonsi: Forcing anisotropy filter to %ix\n",
-                      /* round down to a power of two */
-                      1 << util_logbase2(sscreen->force_aniso));
-       }
-
-       (void) simple_mtx_init(&sscreen->aux_context_lock, mtx_plain);
-       (void) simple_mtx_init(&sscreen->gpu_load_mutex, mtx_plain);
-
-       si_init_gs_info(sscreen);
-       if (!si_init_shader_cache(sscreen)) {
-               FREE(sscreen);
-               return NULL;
-       }
-
-       {
-#define OPT_BOOL(name, dflt, description) \
-               sscreen->options.name = \
-                       driQueryOptionb(config->options, "radeonsi_"#name);
+   struct si_screen *sscreen = CALLOC_STRUCT(si_screen);
+   unsigned hw_threads, num_comp_hi_threads, num_comp_lo_threads;
+   uint64_t test_flags;
+
+   if (!sscreen) {
+      return NULL;
+   }
+
+   sscreen->ws = ws;
+   ws->query_info(ws, &sscreen->info);
+
+   if (sscreen->info.chip_class == GFX10 && LLVM_VERSION_MAJOR < 9) {
+      fprintf(stderr, "radeonsi: Navi family support requires LLVM 9 or higher\n");
+      FREE(sscreen);
+      return NULL;
+   }
+
+   if (sscreen->info.chip_class >= GFX9) {
+      sscreen->se_tile_repeat = 32 * sscreen->info.max_se;
+   } else {
+      ac_get_raster_config(&sscreen->info, &sscreen->pa_sc_raster_config,
+                           &sscreen->pa_sc_raster_config_1, &sscreen->se_tile_repeat);
+   }
+
+   sscreen->debug_flags = debug_get_flags_option("R600_DEBUG", debug_options, 0);
+   sscreen->debug_flags |= debug_get_flags_option("AMD_DEBUG", debug_options, 0);
+   test_flags = debug_get_flags_option("AMD_TEST", test_options, 0);
+
+   if (sscreen->debug_flags & DBG(NO_GFX))
+      sscreen->info.has_graphics = false;
+
+   /* Set functions first. */
+   sscreen->b.context_create = si_pipe_create_context;
+   sscreen->b.destroy = si_destroy_screen;
+   sscreen->b.set_max_shader_compiler_threads = si_set_max_shader_compiler_threads;
+   sscreen->b.is_parallel_shader_compilation_finished = si_is_parallel_shader_compilation_finished;
+   sscreen->b.finalize_nir = si_finalize_nir;
+
+   si_init_screen_get_functions(sscreen);
+   si_init_screen_buffer_functions(sscreen);
+   si_init_screen_fence_functions(sscreen);
+   si_init_screen_state_functions(sscreen);
+   si_init_screen_texture_functions(sscreen);
+   si_init_screen_query_functions(sscreen);
+   si_init_screen_live_shader_cache(sscreen);
+
+   /* Set these flags in debug_flags early, so that the shader cache takes
+    * them into account.
+    */
+   if (driQueryOptionb(config->options, "glsl_correct_derivatives_after_discard"))
+      sscreen->debug_flags |= DBG(FS_CORRECT_DERIVS_AFTER_KILL);
+
+   if (sscreen->debug_flags & DBG(INFO))
+      ac_print_gpu_info(&sscreen->info);
+
+   slab_create_parent(&sscreen->pool_transfers, sizeof(struct si_transfer), 64);
+
+   sscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1));
+   if (sscreen->force_aniso == -1) {
+      sscreen->force_aniso = MIN2(16, debug_get_num_option("AMD_TEX_ANISO", -1));
+   }
+
+   if (sscreen->force_aniso >= 0) {
+      printf("radeonsi: Forcing anisotropy filter to %ix\n",
+             /* round down to a power of two */
+             1 << util_logbase2(sscreen->force_aniso));
+   }
+
+   (void)simple_mtx_init(&sscreen->aux_context_lock, mtx_plain);
+   (void)simple_mtx_init(&sscreen->gpu_load_mutex, mtx_plain);
+
+   si_init_gs_info(sscreen);
+   if (!si_init_shader_cache(sscreen)) {
+      FREE(sscreen);
+      return NULL;
+   }
+
+   {
+#define OPT_BOOL(name, dflt, description)                                                          \
+   sscreen->options.name = driQueryOptionb(config->options, "radeonsi_" #name);
 #include "si_debug_options.h"
-       }
-
-       si_disk_cache_create(sscreen);
-
-       /* Determine the number of shader compiler threads. */
-       hw_threads = sysconf(_SC_NPROCESSORS_ONLN);
-
-       if (hw_threads >= 12) {
-               num_comp_hi_threads = hw_threads * 3 / 4;
-               num_comp_lo_threads = hw_threads / 3;
-       } else if (hw_threads >= 6) {
-               num_comp_hi_threads = hw_threads - 2;
-               num_comp_lo_threads = hw_threads / 2;
-       } else if (hw_threads >= 2) {
-               num_comp_hi_threads = hw_threads - 1;
-               num_comp_lo_threads = hw_threads / 2;
-       } else {
-               num_comp_hi_threads = 1;
-               num_comp_lo_threads = 1;
-       }
-
-       num_comp_hi_threads = MIN2(num_comp_hi_threads,
-                                  ARRAY_SIZE(sscreen->compiler));
-       num_comp_lo_threads = MIN2(num_comp_lo_threads,
-                                  ARRAY_SIZE(sscreen->compiler_lowp));
-
-       /* Take a reference on the glsl types for the compiler threads. */
-       glsl_type_singleton_init_or_ref();
-
-       if (!util_queue_init(&sscreen->shader_compiler_queue, "sh",
-                            64, num_comp_hi_threads,
-                            UTIL_QUEUE_INIT_RESIZE_IF_FULL |
-                            UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY)) {
-               si_destroy_shader_cache(sscreen);
-               FREE(sscreen);
-               glsl_type_singleton_decref();
-               return NULL;
-       }
-
-       if (!util_queue_init(&sscreen->shader_compiler_queue_low_priority,
-                            "shlo",
-                            64, num_comp_lo_threads,
-                            UTIL_QUEUE_INIT_RESIZE_IF_FULL |
-                            UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY |
-                            UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY)) {
-              si_destroy_shader_cache(sscreen);
-              FREE(sscreen);
-              glsl_type_singleton_decref();
-              return NULL;
-       }
-
-       if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false))
-               si_init_perfcounters(sscreen);
-
-       unsigned prim_discard_vertex_count_threshold, tmp;
-       si_initialize_prim_discard_tunables(sscreen, false,
-                                           &prim_discard_vertex_count_threshold,
-                                           &tmp);
-       /* Compute-shader-based culling doesn't support VBOs in user SGPRs. */
-       if (prim_discard_vertex_count_threshold == UINT_MAX)
-               sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1;
-
-       /* Determine tessellation ring info. */
-       bool double_offchip_buffers = sscreen->info.chip_class >= GFX7 &&
-                                     sscreen->info.family != CHIP_CARRIZO &&
-                                     sscreen->info.family != CHIP_STONEY;
-       /* This must be one less than the maximum number due to a hw limitation.
-        * Various hardware bugs need this.
-        */
-       unsigned max_offchip_buffers_per_se;
-
-       if (sscreen->info.chip_class >= GFX10)
-               max_offchip_buffers_per_se = 256;
-       /* Only certain chips can use the maximum value. */
-       else if (sscreen->info.family == CHIP_VEGA12 ||
-                sscreen->info.family == CHIP_VEGA20)
-               max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64;
-       else
-               max_offchip_buffers_per_se = double_offchip_buffers ? 127 : 63;
-
-       unsigned max_offchip_buffers = max_offchip_buffers_per_se *
-                                      sscreen->info.max_se;
-       unsigned offchip_granularity;
-
-       /* Hawaii has a bug with offchip buffers > 256 that can be worked
-        * around by setting 4K granularity.
-        */
-       if (sscreen->info.family == CHIP_HAWAII) {
-               sscreen->tess_offchip_block_dw_size = 4096;
-               offchip_granularity = V_03093C_X_4K_DWORDS;
-       } else {
-               sscreen->tess_offchip_block_dw_size = 8192;
-               offchip_granularity = V_03093C_X_8K_DWORDS;
-       }
-
-       sscreen->tess_factor_ring_size = 32768 * sscreen->info.max_se;
-       sscreen->tess_offchip_ring_size = max_offchip_buffers *
-                                         sscreen->tess_offchip_block_dw_size * 4;
-
-       if (sscreen->info.chip_class >= GFX7) {
-               if (sscreen->info.chip_class >= GFX8)
-                       --max_offchip_buffers;
-               sscreen->vgt_hs_offchip_param =
-                       S_03093C_OFFCHIP_BUFFERING(max_offchip_buffers) |
-                       S_03093C_OFFCHIP_GRANULARITY(offchip_granularity);
-       } else {
-               assert(offchip_granularity == V_03093C_X_8K_DWORDS);
-               sscreen->vgt_hs_offchip_param =
-                       S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers);
-       }
-
-       sscreen->has_draw_indirect_multi =
-               (sscreen->info.family >= CHIP_POLARIS10) ||
-               (sscreen->info.chip_class == GFX8 &&
-                sscreen->info.pfp_fw_version >= 121 &&
-                sscreen->info.me_fw_version >= 87) ||
-               (sscreen->info.chip_class == GFX7 &&
-                sscreen->info.pfp_fw_version >= 211 &&
-                sscreen->info.me_fw_version >= 173) ||
-               (sscreen->info.chip_class == GFX6 &&
-                sscreen->info.pfp_fw_version >= 79 &&
-                sscreen->info.me_fw_version >= 142);
-
-       sscreen->has_out_of_order_rast = sscreen->info.has_out_of_order_rast &&
-                                        !(sscreen->debug_flags & DBG(NO_OUT_OF_ORDER));
-       sscreen->assume_no_z_fights =
-               driQueryOptionb(config->options, "radeonsi_assume_no_z_fights") ||
-               driQueryOptionb(config->options, "allow_draw_out_of_order");
-       sscreen->commutative_blend_add =
-               driQueryOptionb(config->options, "radeonsi_commutative_blend_add") ||
-               driQueryOptionb(config->options, "allow_draw_out_of_order");
-
-       sscreen->use_ngg = sscreen->info.chip_class >= GFX10 &&
-                          sscreen->info.family != CHIP_NAVI14 &&
-                          !(sscreen->debug_flags & DBG(NO_NGG));
-       sscreen->use_ngg_culling = sscreen->use_ngg &&
-                                  !(sscreen->debug_flags & DBG(NO_NGG_CULLING));
-       sscreen->always_use_ngg_culling = sscreen->use_ngg_culling &&
-                                         sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING);
-       sscreen->use_ngg_streamout = false;
-
-       /* Only enable primitive binning on APUs by default. */
-       if (sscreen->info.chip_class >= GFX10) {
-               sscreen->dpbb_allowed = true;
-               sscreen->dfsm_allowed = !sscreen->info.has_dedicated_vram;
-       } else if (sscreen->info.chip_class == GFX9) {
-               sscreen->dpbb_allowed = !sscreen->info.has_dedicated_vram;
-               sscreen->dfsm_allowed = !sscreen->info.has_dedicated_vram;
-       }
-
-       /* Process DPBB enable flags. */
-       if (sscreen->debug_flags & DBG(DPBB)) {
-               sscreen->dpbb_allowed = true;
-               if (sscreen->debug_flags & DBG(DFSM))
-                       sscreen->dfsm_allowed = true;
-       }
-
-       /* Process DPBB disable flags. */
-       if (sscreen->debug_flags & DBG(NO_DPBB)) {
-               sscreen->dpbb_allowed = false;
-               sscreen->dfsm_allowed = false;
-       } else if (sscreen->debug_flags & DBG(NO_DFSM)) {
-               sscreen->dfsm_allowed = false;
-       }
-
-       /* While it would be nice not to have this flag, we are constrained
-        * by the reality that LLVM 9.0 has buggy VGPR indexing on GFX9.
-        */
-       sscreen->llvm_has_working_vgpr_indexing = sscreen->info.chip_class != GFX9;
-
-       sscreen->dcc_msaa_allowed =
-               !(sscreen->debug_flags & DBG(NO_DCC_MSAA));
-
-       (void) simple_mtx_init(&sscreen->shader_parts_mutex, mtx_plain);
-       sscreen->use_monolithic_shaders =
-               (sscreen->debug_flags & DBG(MONOLITHIC_SHADERS)) != 0;
-
-       sscreen->barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SCACHE |
-                                         SI_CONTEXT_INV_VCACHE;
-       if (sscreen->info.chip_class <= GFX8) {
-               sscreen->barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_L2;
-               sscreen->barrier_flags.L2_to_cp |= SI_CONTEXT_WB_L2;
-       }
-
-       if (debug_get_bool_option("RADEON_DUMP_SHADERS", false))
-               sscreen->debug_flags |= DBG_ALL_SHADERS;
-
-       /* Syntax:
-        *     EQAA=s,z,c
-        * Example:
-        *     EQAA=8,4,2
-
-        * That means 8 coverage samples, 4 Z/S samples, and 2 color samples.
-        * Constraints:
-        *     s >= z >= c (ignoring this only wastes memory)
-        *     s = [2..16]
-        *     z = [2..8]
-        *     c = [2..8]
-        *
-        * Only MSAA color and depth buffers are overriden.
-        */
-       if (sscreen->info.has_eqaa_surface_allocator) {
-               const char *eqaa = debug_get_option("EQAA", NULL);
-               unsigned s,z,f;
-
-               if (eqaa && sscanf(eqaa, "%u,%u,%u", &s, &z, &f) == 3 && s && z && f) {
-                       sscreen->eqaa_force_coverage_samples = s;
-                       sscreen->eqaa_force_z_samples = z;
-                       sscreen->eqaa_force_color_samples = f;
-               }
-       }
-
-       sscreen->ge_wave_size = 64;
-       sscreen->ps_wave_size = 64;
-       sscreen->compute_wave_size = 64;
-
-       if (sscreen->info.chip_class >= GFX10) {
-               /* Pixels shaders: Wave64 is recommended.
-                * Compute shaders: There are piglit failures with Wave32.
-                */
-               sscreen->ge_wave_size = 32;
-
-               if (sscreen->debug_flags & DBG(W32_GE))
-                       sscreen->ge_wave_size = 32;
-               if (sscreen->debug_flags & DBG(W32_PS))
-                       sscreen->ps_wave_size = 32;
-               if (sscreen->debug_flags & DBG(W32_CS))
-                       sscreen->compute_wave_size = 32;
-
-               if (sscreen->debug_flags & DBG(W64_GE))
-                       sscreen->ge_wave_size = 64;
-               if (sscreen->debug_flags & DBG(W64_PS))
-                       sscreen->ps_wave_size = 64;
-               if (sscreen->debug_flags & DBG(W64_CS))
-                       sscreen->compute_wave_size = 64;
-       }
-
-       /* Create the auxiliary context. This must be done last. */
-       sscreen->aux_context = si_create_context(&sscreen->b,
-               (sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0) |
-               (sscreen->info.has_graphics ? 0 : PIPE_CONTEXT_COMPUTE_ONLY));
-       if (sscreen->options.aux_debug) {
-               struct u_log_context *log = CALLOC_STRUCT(u_log_context);
-               u_log_context_init(log);
-               sscreen->aux_context->set_log_context(sscreen->aux_context, log);
-       }
-
-       if (test_flags & DBG(TEST_DMA))
-               si_test_dma(sscreen);
-
-       if (test_flags & DBG(TEST_DMA_PERF)) {
-               si_test_dma_perf(sscreen);
-       }
-
-       if (test_flags & (DBG(TEST_VMFAULT_CP) |
-                                     DBG(TEST_VMFAULT_SDMA) |
-                                     DBG(TEST_VMFAULT_SHADER)))
-               si_test_vmfault(sscreen, test_flags);
-
-       if (test_flags & DBG(TEST_GDS))
-               si_test_gds((struct si_context*)sscreen->aux_context);
-
-       if (test_flags & DBG(TEST_GDS_MM)) {
-               si_test_gds_memory_management((struct si_context*)sscreen->aux_context,
-                                             32 * 1024, 4, RADEON_DOMAIN_GDS);
-       }
-       if (test_flags & DBG(TEST_GDS_OA_MM)) {
-               si_test_gds_memory_management((struct si_context*)sscreen->aux_context,
-                                             4, 1, RADEON_DOMAIN_OA);
-       }
-
-       STATIC_ASSERT(sizeof(union si_vgt_stages_key) == 4);
-       return &sscreen->b;
+   }
+
+   si_disk_cache_create(sscreen);
+
+   /* Determine the number of shader compiler threads. */
+   hw_threads = sysconf(_SC_NPROCESSORS_ONLN);
+
+   if (hw_threads >= 12) {
+      num_comp_hi_threads = hw_threads * 3 / 4;
+      num_comp_lo_threads = hw_threads / 3;
+   } else if (hw_threads >= 6) {
+      num_comp_hi_threads = hw_threads - 2;
+      num_comp_lo_threads = hw_threads / 2;
+   } else if (hw_threads >= 2) {
+      num_comp_hi_threads = hw_threads - 1;
+      num_comp_lo_threads = hw_threads / 2;
+   } else {
+      num_comp_hi_threads = 1;
+      num_comp_lo_threads = 1;
+   }
+
+   num_comp_hi_threads = MIN2(num_comp_hi_threads, ARRAY_SIZE(sscreen->compiler));
+   num_comp_lo_threads = MIN2(num_comp_lo_threads, ARRAY_SIZE(sscreen->compiler_lowp));
+
+   /* Take a reference on the glsl types for the compiler threads. */
+   glsl_type_singleton_init_or_ref();
+
+   if (!util_queue_init(
+          &sscreen->shader_compiler_queue, "sh", 64, num_comp_hi_threads,
+          UTIL_QUEUE_INIT_RESIZE_IF_FULL | UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY)) {
+      si_destroy_shader_cache(sscreen);
+      FREE(sscreen);
+      glsl_type_singleton_decref();
+      return NULL;
+   }
+
+   if (!util_queue_init(&sscreen->shader_compiler_queue_low_priority, "shlo", 64,
+                        num_comp_lo_threads,
+                        UTIL_QUEUE_INIT_RESIZE_IF_FULL | UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY |
+                           UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY)) {
+      si_destroy_shader_cache(sscreen);
+      FREE(sscreen);
+      glsl_type_singleton_decref();
+      return NULL;
+   }
+
+   if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false))
+      si_init_perfcounters(sscreen);
+
+   unsigned prim_discard_vertex_count_threshold, tmp;
+   si_initialize_prim_discard_tunables(sscreen, false, &prim_discard_vertex_count_threshold, &tmp);
+   /* Compute-shader-based culling doesn't support VBOs in user SGPRs. */
+   if (prim_discard_vertex_count_threshold == UINT_MAX)
+      sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1;
+
+   /* Determine tessellation ring info. */
+   bool double_offchip_buffers = sscreen->info.chip_class >= GFX7 &&
+                                 sscreen->info.family != CHIP_CARRIZO &&
+                                 sscreen->info.family != CHIP_STONEY;
+   /* This must be one less than the maximum number due to a hw limitation.
+    * Various hardware bugs need this.
+    */
+   unsigned max_offchip_buffers_per_se;
+
+   if (sscreen->info.chip_class >= GFX10)
+      max_offchip_buffers_per_se = 256;
+   /* Only certain chips can use the maximum value. */
+   else if (sscreen->info.family == CHIP_VEGA12 || sscreen->info.family == CHIP_VEGA20)
+      max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64;
+   else
+      max_offchip_buffers_per_se = double_offchip_buffers ? 127 : 63;
+
+   unsigned max_offchip_buffers = max_offchip_buffers_per_se * sscreen->info.max_se;
+   unsigned offchip_granularity;
+
+   /* Hawaii has a bug with offchip buffers > 256 that can be worked
+    * around by setting 4K granularity.
+    */
+   if (sscreen->info.family == CHIP_HAWAII) {
+      sscreen->tess_offchip_block_dw_size = 4096;
+      offchip_granularity = V_03093C_X_4K_DWORDS;
+   } else {
+      sscreen->tess_offchip_block_dw_size = 8192;
+      offchip_granularity = V_03093C_X_8K_DWORDS;
+   }
+
+   sscreen->tess_factor_ring_size = 32768 * sscreen->info.max_se;
+   sscreen->tess_offchip_ring_size = max_offchip_buffers * sscreen->tess_offchip_block_dw_size * 4;
+
+   if (sscreen->info.chip_class >= GFX7) {
+      if (sscreen->info.chip_class >= GFX8)
+         --max_offchip_buffers;
+      sscreen->vgt_hs_offchip_param = S_03093C_OFFCHIP_BUFFERING(max_offchip_buffers) |
+                                      S_03093C_OFFCHIP_GRANULARITY(offchip_granularity);
+   } else {
+      assert(offchip_granularity == V_03093C_X_8K_DWORDS);
+      sscreen->vgt_hs_offchip_param = S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers);
+   }
+
+   sscreen->has_draw_indirect_multi =
+      (sscreen->info.family >= CHIP_POLARIS10) ||
+      (sscreen->info.chip_class == GFX8 && sscreen->info.pfp_fw_version >= 121 &&
+       sscreen->info.me_fw_version >= 87) ||
+      (sscreen->info.chip_class == GFX7 && sscreen->info.pfp_fw_version >= 211 &&
+       sscreen->info.me_fw_version >= 173) ||
+      (sscreen->info.chip_class == GFX6 && sscreen->info.pfp_fw_version >= 79 &&
+       sscreen->info.me_fw_version >= 142);
+
+   sscreen->has_out_of_order_rast =
+      sscreen->info.has_out_of_order_rast && !(sscreen->debug_flags & DBG(NO_OUT_OF_ORDER));
+   sscreen->assume_no_z_fights = driQueryOptionb(config->options, "radeonsi_assume_no_z_fights") ||
+                                 driQueryOptionb(config->options, "allow_draw_out_of_order");
+   sscreen->commutative_blend_add =
+      driQueryOptionb(config->options, "radeonsi_commutative_blend_add") ||
+      driQueryOptionb(config->options, "allow_draw_out_of_order");
+
+   sscreen->use_ngg = sscreen->info.chip_class >= GFX10 && sscreen->info.family != CHIP_NAVI14 &&
+                      !(sscreen->debug_flags & DBG(NO_NGG));
+   sscreen->use_ngg_culling = sscreen->use_ngg && !(sscreen->debug_flags & DBG(NO_NGG_CULLING));
+   sscreen->always_use_ngg_culling =
+      sscreen->use_ngg_culling && sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING);
+   sscreen->use_ngg_streamout = false;
+
+   /* Only enable primitive binning on APUs by default. */
+   if (sscreen->info.chip_class >= GFX10) {
+      sscreen->dpbb_allowed = true;
+      sscreen->dfsm_allowed = !sscreen->info.has_dedicated_vram;
+   } else if (sscreen->info.chip_class == GFX9) {
+      sscreen->dpbb_allowed = !sscreen->info.has_dedicated_vram;
+      sscreen->dfsm_allowed = !sscreen->info.has_dedicated_vram;
+   }
+
+   /* Process DPBB enable flags. */
+   if (sscreen->debug_flags & DBG(DPBB)) {
+      sscreen->dpbb_allowed = true;
+      if (sscreen->debug_flags & DBG(DFSM))
+         sscreen->dfsm_allowed = true;
+   }
+
+   /* Process DPBB disable flags. */
+   if (sscreen->debug_flags & DBG(NO_DPBB)) {
+      sscreen->dpbb_allowed = false;
+      sscreen->dfsm_allowed = false;
+   } else if (sscreen->debug_flags & DBG(NO_DFSM)) {
+      sscreen->dfsm_allowed = false;
+   }
+
+   /* While it would be nice not to have this flag, we are constrained
+    * by the reality that LLVM 9.0 has buggy VGPR indexing on GFX9.
+    */
+   sscreen->llvm_has_working_vgpr_indexing = sscreen->info.chip_class != GFX9;
+
+   sscreen->dcc_msaa_allowed = !(sscreen->debug_flags & DBG(NO_DCC_MSAA));
+
+   (void)simple_mtx_init(&sscreen->shader_parts_mutex, mtx_plain);
+   sscreen->use_monolithic_shaders = (sscreen->debug_flags & DBG(MONOLITHIC_SHADERS)) != 0;
+
+   sscreen->barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
+   if (sscreen->info.chip_class <= GFX8) {
+      sscreen->barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_L2;
+      sscreen->barrier_flags.L2_to_cp |= SI_CONTEXT_WB_L2;
+   }
+
+   if (debug_get_bool_option("RADEON_DUMP_SHADERS", false))
+      sscreen->debug_flags |= DBG_ALL_SHADERS;
+
+   /* Syntax:
+    *     EQAA=s,z,c
+    * Example:
+    *     EQAA=8,4,2
+    *
+    * That means 8 coverage samples, 4 Z/S samples, and 2 color samples.
+    * Constraints:
+    *     s >= z >= c (ignoring this only wastes memory)
+    *     s = [2..16]
+    *     z = [2..8]
+    *     c = [2..8]
+    *
+    * Only MSAA color and depth buffers are overridden.
+    */
+   if (sscreen->info.has_eqaa_surface_allocator) {
+      const char *eqaa = debug_get_option("EQAA", NULL);
+      unsigned s, z, f;
+
+      if (eqaa && sscanf(eqaa, "%u,%u,%u", &s, &z, &f) == 3 && s && z && f) {
+         sscreen->eqaa_force_coverage_samples = s;
+         sscreen->eqaa_force_z_samples = z;
+         sscreen->eqaa_force_color_samples = f;
+      }
+   }
+
+   sscreen->ge_wave_size = 64;
+   sscreen->ps_wave_size = 64;
+   sscreen->compute_wave_size = 64;
+
+   if (sscreen->info.chip_class >= GFX10) {
+      /* Pixel shaders: Wave64 is recommended.
+       * Compute shaders: There are piglit failures with Wave32.
+       */
+      sscreen->ge_wave_size = 32;
+
+      if (sscreen->debug_flags & DBG(W32_GE))
+         sscreen->ge_wave_size = 32;
+      if (sscreen->debug_flags & DBG(W32_PS))
+         sscreen->ps_wave_size = 32;
+      if (sscreen->debug_flags & DBG(W32_CS))
+         sscreen->compute_wave_size = 32;
+
+      if (sscreen->debug_flags & DBG(W64_GE))
+         sscreen->ge_wave_size = 64;
+      if (sscreen->debug_flags & DBG(W64_PS))
+         sscreen->ps_wave_size = 64;
+      if (sscreen->debug_flags & DBG(W64_CS))
+         sscreen->compute_wave_size = 64;
+   }
+
+   /* Create the auxiliary context. This must be done last. */
+   sscreen->aux_context = si_create_context(
+      &sscreen->b, (sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0) |
+                      (sscreen->info.has_graphics ? 0 : PIPE_CONTEXT_COMPUTE_ONLY));
+   if (sscreen->options.aux_debug) {
+      struct u_log_context *log = CALLOC_STRUCT(u_log_context);
+      u_log_context_init(log);
+      sscreen->aux_context->set_log_context(sscreen->aux_context, log);
+   }
+
+   if (test_flags & DBG(TEST_DMA))
+      si_test_dma(sscreen);
+
+   if (test_flags & DBG(TEST_DMA_PERF)) {
+      si_test_dma_perf(sscreen);
+   }
+
+   if (test_flags & (DBG(TEST_VMFAULT_CP) | DBG(TEST_VMFAULT_SDMA) | DBG(TEST_VMFAULT_SHADER)))
+      si_test_vmfault(sscreen, test_flags);
+
+   if (test_flags & DBG(TEST_GDS))
+      si_test_gds((struct si_context *)sscreen->aux_context);
+
+   if (test_flags & DBG(TEST_GDS_MM)) {
+      si_test_gds_memory_management((struct si_context *)sscreen->aux_context, 32 * 1024, 4,
+                                    RADEON_DOMAIN_GDS);
+   }
+   if (test_flags & DBG(TEST_GDS_OA_MM)) {
+      si_test_gds_memory_management((struct si_context *)sscreen->aux_context, 4, 1,
+                                    RADEON_DOMAIN_OA);
+   }
+
+   STATIC_ASSERT(sizeof(union si_vgt_stages_key) == 4);
+   return &sscreen->b;
 }
 
 struct pipe_screen *radeonsi_screen_create(int fd, const struct pipe_screen_config *config)
 {
-       drmVersionPtr version = drmGetVersion(fd);
-       struct radeon_winsys *rw = NULL;
-
-       switch (version->version_major) {
-       case 2:
-               rw = radeon_drm_winsys_create(fd, config, radeonsi_screen_create_impl);
-               break;
-       case 3:
-               rw = amdgpu_winsys_create(fd, config, radeonsi_screen_create_impl);
-               break;
-       }
-
-       drmFreeVersion(version);
-       return rw ? rw->screen : NULL;
+   drmVersionPtr version = drmGetVersion(fd);
+   struct radeon_winsys *rw = NULL;
+
+   switch (version->version_major) {
+   case 2:
+      rw = radeon_drm_winsys_create(fd, config, radeonsi_screen_create_impl);
+      break;
+   case 3:
+      rw = amdgpu_winsys_create(fd, config, radeonsi_screen_create_impl);
+      break;
+   }
+
+   drmFreeVersion(version);
+   return rw ? rw->screen : NULL;
 }
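
(Editorial aside, not part of the commit: the EQAA override documented in the comment block
above is the one user-facing knob touched in this file. As a minimal standalone sketch of what
the driver does with it, assuming only the C standard library, the mechanism reduces to a single
sscanf of the "s,z,c" triple; in the driver the string comes from debug_get_option("EQAA", NULL)
rather than being hard-coded.)

   #include <stdio.h>

   int main(void)
   {
      const char *eqaa = "8,4,2"; /* stand-in for debug_get_option("EQAA", NULL) */
      unsigned s, z, f;

      /* Mirror of the driver check above: all three values must parse and be non-zero. */
      if (eqaa && sscanf(eqaa, "%u,%u,%u", &s, &z, &f) == 3 && s && z && f)
         printf("coverage=%u, z/s=%u, color=%u samples\n", s, z, f);
      else
         printf("EQAA override ignored\n");
      return 0;
   }
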
index 400f2152243aa41d7f4fad3a6c05eda411f514ec..30f7832f71cece67eaacc50e970cd59abd3a9aed 100644 (file)
@@ -27,7 +27,6 @@
 
 #include "si_shader.h"
 #include "si_state.h"
-
 #include "util/u_dynarray.h"
 #include "util/u_idalloc.h"
 #include "util/u_threaded_context.h"
 #define SI_BIG_ENDIAN 0
 #endif
 
-#define ATI_VENDOR_ID                  0x1002
-#define SI_PRIM_DISCARD_DEBUG          0
-#define SI_NOT_QUERY                   0xffffffff
+#define ATI_VENDOR_ID         0x1002
+#define SI_PRIM_DISCARD_DEBUG 0
+#define SI_NOT_QUERY          0xffffffff
 
 /* The base vertex and primitive restart can be any number, but we must pick
  * one which will mean "unknown" for the purpose of state tracking and
  * the number shouldn't be a commonly-used one. */
-#define SI_BASE_VERTEX_UNKNOWN         INT_MIN
-#define SI_RESTART_INDEX_UNKNOWN       INT_MIN
-#define SI_INSTANCE_COUNT_UNKNOWN      INT_MIN
-#define SI_NUM_SMOOTH_AA_SAMPLES       8
-#define SI_MAX_POINT_SIZE              2048
-#define SI_GS_PER_ES                   128
+#define SI_BASE_VERTEX_UNKNOWN    INT_MIN
+#define SI_RESTART_INDEX_UNKNOWN  INT_MIN
+#define SI_INSTANCE_COUNT_UNKNOWN INT_MIN
+#define SI_NUM_SMOOTH_AA_SAMPLES  8
+#define SI_MAX_POINT_SIZE         2048
+#define SI_GS_PER_ES              128
 /* Alignment for optimal CP DMA performance. */
-#define SI_CPDMA_ALIGNMENT             32
+#define SI_CPDMA_ALIGNMENT 32
 
 /* Tunables for compute-based clear_buffer and copy_buffer: */
-#define SI_COMPUTE_CLEAR_DW_PER_THREAD 4
-#define SI_COMPUTE_COPY_DW_PER_THREAD  4
-#define SI_COMPUTE_DST_CACHE_POLICY    L2_STREAM
+#define SI_COMPUTE_CLEAR_DW_PER_THREAD 4
+#define SI_COMPUTE_COPY_DW_PER_THREAD  4
+#define SI_COMPUTE_DST_CACHE_POLICY    L2_STREAM
 
 /* Pipeline & streamout query controls. */
-#define SI_CONTEXT_START_PIPELINE_STATS        (1 << 0)
-#define SI_CONTEXT_STOP_PIPELINE_STATS (1 << 1)
+#define SI_CONTEXT_START_PIPELINE_STATS  (1 << 0)
+#define SI_CONTEXT_STOP_PIPELINE_STATS   (1 << 1)
 #define SI_CONTEXT_FLUSH_FOR_RENDER_COND (1 << 2)
 /* Instruction cache. */
-#define SI_CONTEXT_INV_ICACHE          (1 << 3)
+#define SI_CONTEXT_INV_ICACHE (1 << 3)
 /* Scalar cache. (GFX6-9: scalar L1; GFX10: scalar L0)
  * GFX10: This also invalidates the L1 shader array cache. */
-#define SI_CONTEXT_INV_SCACHE          (1 << 4)
+#define SI_CONTEXT_INV_SCACHE (1 << 4)
 /* Vector cache. (GFX6-9: vector L1; GFX10: vector L0)
  * GFX10: This also invalidates the L1 shader array cache. */
-#define SI_CONTEXT_INV_VCACHE          (1 << 5)
+#define SI_CONTEXT_INV_VCACHE (1 << 5)
 /* L2 cache + L2 metadata cache writeback & invalidate.
  * GFX6-8: Used by shaders only. GFX9-10: Used by everything. */
-#define SI_CONTEXT_INV_L2              (1 << 6)
+#define SI_CONTEXT_INV_L2 (1 << 6)
 /* L2 writeback (write dirty L2 lines to memory for non-L2 clients).
  * Only used for coherency with non-L2 clients like CB, DB, CP on GFX6-8.
  * GFX6-7 will do complete invalidation, because the writeback is unsupported. */
-#define SI_CONTEXT_WB_L2               (1 << 7)
+#define SI_CONTEXT_WB_L2 (1 << 7)
 /* Writeback & invalidate the L2 metadata cache only. It can only be coupled with
  * a CB or DB flush. */
-#define SI_CONTEXT_INV_L2_METADATA     (1 << 8)
+#define SI_CONTEXT_INV_L2_METADATA (1 << 8)
 /* Framebuffer caches. */
-#define SI_CONTEXT_FLUSH_AND_INV_DB    (1 << 9)
+#define SI_CONTEXT_FLUSH_AND_INV_DB      (1 << 9)
 #define SI_CONTEXT_FLUSH_AND_INV_DB_META (1 << 10)
-#define SI_CONTEXT_FLUSH_AND_INV_CB    (1 << 11)
+#define SI_CONTEXT_FLUSH_AND_INV_CB      (1 << 11)
 /* Engine synchronization. */
-#define SI_CONTEXT_VS_PARTIAL_FLUSH    (1 << 12)
-#define SI_CONTEXT_PS_PARTIAL_FLUSH    (1 << 13)
-#define SI_CONTEXT_CS_PARTIAL_FLUSH    (1 << 14)
-#define SI_CONTEXT_VGT_FLUSH           (1 << 15)
-#define SI_CONTEXT_VGT_STREAMOUT_SYNC  (1 << 16)
-
-#define SI_PREFETCH_VBO_DESCRIPTORS    (1 << 0)
-#define SI_PREFETCH_LS                 (1 << 1)
-#define SI_PREFETCH_HS                 (1 << 2)
-#define SI_PREFETCH_ES                 (1 << 3)
-#define SI_PREFETCH_GS                 (1 << 4)
-#define SI_PREFETCH_VS                 (1 << 5)
-#define SI_PREFETCH_PS                 (1 << 6)
-
-#define SI_MAX_BORDER_COLORS           4096
-#define SI_MAX_VIEWPORTS               16
-#define SIX_BITS                       0x3F
-#define SI_MAP_BUFFER_ALIGNMENT                64
+#define SI_CONTEXT_VS_PARTIAL_FLUSH   (1 << 12)
+#define SI_CONTEXT_PS_PARTIAL_FLUSH   (1 << 13)
+#define SI_CONTEXT_CS_PARTIAL_FLUSH   (1 << 14)
+#define SI_CONTEXT_VGT_FLUSH          (1 << 15)
+#define SI_CONTEXT_VGT_STREAMOUT_SYNC (1 << 16)
+
+#define SI_PREFETCH_VBO_DESCRIPTORS (1 << 0)
+#define SI_PREFETCH_LS              (1 << 1)
+#define SI_PREFETCH_HS              (1 << 2)
+#define SI_PREFETCH_ES              (1 << 3)
+#define SI_PREFETCH_GS              (1 << 4)
+#define SI_PREFETCH_VS              (1 << 5)
+#define SI_PREFETCH_PS              (1 << 6)
+
+#define SI_MAX_BORDER_COLORS              4096
+#define SI_MAX_VIEWPORTS                  16
+#define SIX_BITS                          0x3F
+#define SI_MAP_BUFFER_ALIGNMENT           64
 #define SI_MAX_VARIABLE_THREADS_PER_BLOCK 1024
 
-#define SI_RESOURCE_FLAG_TRANSFER      (PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
-#define SI_RESOURCE_FLAG_FLUSHED_DEPTH (PIPE_RESOURCE_FLAG_DRV_PRIV << 1)
+#define SI_RESOURCE_FLAG_TRANSFER          (PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
+#define SI_RESOURCE_FLAG_FLUSHED_DEPTH     (PIPE_RESOURCE_FLAG_DRV_PRIV << 1)
 #define SI_RESOURCE_FLAG_FORCE_MSAA_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2)
-#define SI_RESOURCE_FLAG_DISABLE_DCC   (PIPE_RESOURCE_FLAG_DRV_PRIV << 3)
-#define SI_RESOURCE_FLAG_UNMAPPABLE    (PIPE_RESOURCE_FLAG_DRV_PRIV << 4)
-#define SI_RESOURCE_FLAG_READ_ONLY     (PIPE_RESOURCE_FLAG_DRV_PRIV << 5)
-#define SI_RESOURCE_FLAG_32BIT         (PIPE_RESOURCE_FLAG_DRV_PRIV << 6)
-#define SI_RESOURCE_FLAG_CLEAR         (PIPE_RESOURCE_FLAG_DRV_PRIV << 7)
+#define SI_RESOURCE_FLAG_DISABLE_DCC       (PIPE_RESOURCE_FLAG_DRV_PRIV << 3)
+#define SI_RESOURCE_FLAG_UNMAPPABLE        (PIPE_RESOURCE_FLAG_DRV_PRIV << 4)
+#define SI_RESOURCE_FLAG_READ_ONLY         (PIPE_RESOURCE_FLAG_DRV_PRIV << 5)
+#define SI_RESOURCE_FLAG_32BIT             (PIPE_RESOURCE_FLAG_DRV_PRIV << 6)
+#define SI_RESOURCE_FLAG_CLEAR             (PIPE_RESOURCE_FLAG_DRV_PRIV << 7)
 /* For const_uploader, upload data via GTT and copy to VRAM on context flush via SDMA. */
-#define SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA  (PIPE_RESOURCE_FLAG_DRV_PRIV << 8)
+#define SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA (PIPE_RESOURCE_FLAG_DRV_PRIV << 8)
 /* Set a micro tile mode: */
-#define SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE  (PIPE_RESOURCE_FLAG_DRV_PRIV << 9)
-#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT  (util_logbase2(PIPE_RESOURCE_FLAG_DRV_PRIV) + 10)
-#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_SET(x) (((x) & 0x3) << SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT)
-#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_GET(x) (((x) >> SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT) & 0x3)
+#define SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE (PIPE_RESOURCE_FLAG_DRV_PRIV << 9)
+#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT (util_logbase2(PIPE_RESOURCE_FLAG_DRV_PRIV) + 10)
+#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_SET(x)                                                    \
+   (((x)&0x3) << SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT)
+#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_GET(x)                                                    \
+   (((x) >> SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT) & 0x3)
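
(Editorial aside, not part of the commit: the two macros above pack a 2-bit micro tile mode into
the driver-private resource-flag bits and read it back. The following minimal standalone sketch
shows the same pack/unpack pattern, with the shift hard-coded for brevity in place of
util_logbase2(PIPE_RESOURCE_FLAG_DRV_PRIV) + 10.)

   #include <assert.h>
   #include <stdio.h>

   #define MICRO_TILE_MODE_SHIFT  10
   #define MICRO_TILE_MODE_SET(x) (((x) & 0x3) << MICRO_TILE_MODE_SHIFT)
   #define MICRO_TILE_MODE_GET(x) (((x) >> MICRO_TILE_MODE_SHIFT) & 0x3)

   int main(void)
   {
      unsigned flags = MICRO_TILE_MODE_SET(2); /* pack mode 2 into the flag word */
      assert(MICRO_TILE_MODE_GET(flags) == 2); /* and unpack it again */
      printf("micro tile mode = %u\n", MICRO_TILE_MODE_GET(flags));
      return 0;
   }
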
 
 enum si_clear_code
 {
-       DCC_CLEAR_COLOR_0000   = 0x00000000,
-       DCC_CLEAR_COLOR_0001   = 0x40404040,
-       DCC_CLEAR_COLOR_1110   = 0x80808080,
-       DCC_CLEAR_COLOR_1111   = 0xC0C0C0C0,
-       DCC_CLEAR_COLOR_REG    = 0x20202020,
-       DCC_UNCOMPRESSED       = 0xFFFFFFFF,
+   DCC_CLEAR_COLOR_0000 = 0x00000000,
+   DCC_CLEAR_COLOR_0001 = 0x40404040,
+   DCC_CLEAR_COLOR_1110 = 0x80808080,
+   DCC_CLEAR_COLOR_1111 = 0xC0C0C0C0,
+   DCC_CLEAR_COLOR_REG = 0x20202020,
+   DCC_UNCOMPRESSED = 0xFFFFFFFF,
 };
 
-#define SI_IMAGE_ACCESS_AS_BUFFER      (1 << 7)
+#define SI_IMAGE_ACCESS_AS_BUFFER (1 << 7)
 
 /* Debug flags. */
-enum {
-       /* Shader logging options: */
-       DBG_VS = PIPE_SHADER_VERTEX,
-       DBG_PS = PIPE_SHADER_FRAGMENT,
-       DBG_GS = PIPE_SHADER_GEOMETRY,
-       DBG_TCS = PIPE_SHADER_TESS_CTRL,
-       DBG_TES = PIPE_SHADER_TESS_EVAL,
-       DBG_CS = PIPE_SHADER_COMPUTE,
-       DBG_NO_IR,
-       DBG_NO_NIR,
-       DBG_NO_ASM,
-       DBG_PREOPT_IR,
-
-       /* Shader compiler options the shader cache should be aware of: */
-       DBG_FS_CORRECT_DERIVS_AFTER_KILL,
-       DBG_GISEL,
-       DBG_W32_GE,
-       DBG_W32_PS,
-       DBG_W32_CS,
-       DBG_W64_GE,
-       DBG_W64_PS,
-       DBG_W64_CS,
-
-       /* Shader compiler options (with no effect on the shader cache): */
-       DBG_CHECK_IR,
-       DBG_MONOLITHIC_SHADERS,
-       DBG_NO_OPT_VARIANT,
-
-       /* Information logging options: */
-       DBG_INFO,
-       DBG_TEX,
-       DBG_COMPUTE,
-       DBG_VM,
-       DBG_CACHE_STATS,
-
-       /* Driver options: */
-       DBG_FORCE_SDMA,
-       DBG_NO_SDMA,
-       DBG_NO_SDMA_CLEARS,
-       DBG_NO_SDMA_COPY_IMAGE,
-       DBG_NO_WC,
-       DBG_CHECK_VM,
-       DBG_RESERVE_VMID,
-       DBG_ZERO_VRAM,
-
-       /* 3D engine options: */
-       DBG_NO_GFX,
-       DBG_NO_NGG,
-       DBG_ALWAYS_NGG_CULLING,
-       DBG_NO_NGG_CULLING,
-       DBG_ALWAYS_PD,
-       DBG_PD,
-       DBG_NO_PD,
-       DBG_SWITCH_ON_EOP,
-       DBG_NO_OUT_OF_ORDER,
-       DBG_NO_DPBB,
-       DBG_NO_DFSM,
-       DBG_DPBB,
-       DBG_DFSM,
-       DBG_NO_HYPERZ,
-       DBG_NO_RB_PLUS,
-       DBG_NO_2D_TILING,
-       DBG_NO_TILING,
-       DBG_NO_DCC,
-       DBG_NO_DCC_CLEAR,
-       DBG_NO_DCC_FB,
-       DBG_NO_DCC_MSAA,
-       DBG_NO_FMASK,
-
-       DBG_COUNT
+enum
+{
+   /* Shader logging options: */
+   DBG_VS = PIPE_SHADER_VERTEX,
+   DBG_PS = PIPE_SHADER_FRAGMENT,
+   DBG_GS = PIPE_SHADER_GEOMETRY,
+   DBG_TCS = PIPE_SHADER_TESS_CTRL,
+   DBG_TES = PIPE_SHADER_TESS_EVAL,
+   DBG_CS = PIPE_SHADER_COMPUTE,
+   DBG_NO_IR,
+   DBG_NO_NIR,
+   DBG_NO_ASM,
+   DBG_PREOPT_IR,
+
+   /* Shader compiler options the shader cache should be aware of: */
+   DBG_FS_CORRECT_DERIVS_AFTER_KILL,
+   DBG_GISEL,
+   DBG_W32_GE,
+   DBG_W32_PS,
+   DBG_W32_CS,
+   DBG_W64_GE,
+   DBG_W64_PS,
+   DBG_W64_CS,
+
+   /* Shader compiler options (with no effect on the shader cache): */
+   DBG_CHECK_IR,
+   DBG_MONOLITHIC_SHADERS,
+   DBG_NO_OPT_VARIANT,
+
+   /* Information logging options: */
+   DBG_INFO,
+   DBG_TEX,
+   DBG_COMPUTE,
+   DBG_VM,
+   DBG_CACHE_STATS,
+
+   /* Driver options: */
+   DBG_FORCE_SDMA,
+   DBG_NO_SDMA,
+   DBG_NO_SDMA_CLEARS,
+   DBG_NO_SDMA_COPY_IMAGE,
+   DBG_NO_WC,
+   DBG_CHECK_VM,
+   DBG_RESERVE_VMID,
+   DBG_ZERO_VRAM,
+
+   /* 3D engine options: */
+   DBG_NO_GFX,
+   DBG_NO_NGG,
+   DBG_ALWAYS_NGG_CULLING,
+   DBG_NO_NGG_CULLING,
+   DBG_ALWAYS_PD,
+   DBG_PD,
+   DBG_NO_PD,
+   DBG_SWITCH_ON_EOP,
+   DBG_NO_OUT_OF_ORDER,
+   DBG_NO_DPBB,
+   DBG_NO_DFSM,
+   DBG_DPBB,
+   DBG_DFSM,
+   DBG_NO_HYPERZ,
+   DBG_NO_RB_PLUS,
+   DBG_NO_2D_TILING,
+   DBG_NO_TILING,
+   DBG_NO_DCC,
+   DBG_NO_DCC_CLEAR,
+   DBG_NO_DCC_FB,
+   DBG_NO_DCC_MSAA,
+   DBG_NO_FMASK,
+
+   DBG_COUNT
 };
 
-enum {
-       /* Tests: */
-       DBG_TEST_DMA,
-       DBG_TEST_VMFAULT_CP,
-       DBG_TEST_VMFAULT_SDMA,
-       DBG_TEST_VMFAULT_SHADER,
-       DBG_TEST_DMA_PERF,
-       DBG_TEST_GDS,
-       DBG_TEST_GDS_MM,
-       DBG_TEST_GDS_OA_MM,
+enum
+{
+   /* Tests: */
+   DBG_TEST_DMA,
+   DBG_TEST_VMFAULT_CP,
+   DBG_TEST_VMFAULT_SDMA,
+   DBG_TEST_VMFAULT_SHADER,
+   DBG_TEST_DMA_PERF,
+   DBG_TEST_GDS,
+   DBG_TEST_GDS_MM,
+   DBG_TEST_GDS_OA_MM,
 };
 
-#define DBG_ALL_SHADERS                (((1 << (DBG_CS + 1)) - 1))
-#define DBG(name)              (1ull << DBG_##name)
+#define DBG_ALL_SHADERS (((1 << (DBG_CS + 1)) - 1))
+#define DBG(name)       (1ull << DBG_##name)
 
-enum si_cache_policy {
-       L2_BYPASS,
-       L2_STREAM, /* same as SLC=1 */
-       L2_LRU,    /* same as SLC=0 */
+enum si_cache_policy
+{
+   L2_BYPASS,
+   L2_STREAM, /* same as SLC=1 */
+   L2_LRU,    /* same as SLC=0 */
 };
 
-enum si_coherency {
-       SI_COHERENCY_NONE, /* no cache flushes needed */
-       SI_COHERENCY_SHADER,
-       SI_COHERENCY_CB_META,
-       SI_COHERENCY_CP,
+enum si_coherency
+{
+   SI_COHERENCY_NONE, /* no cache flushes needed */
+   SI_COHERENCY_SHADER,
+   SI_COHERENCY_CB_META,
+   SI_COHERENCY_CP,
 };
 
 struct si_compute;
@@ -244,528 +249,523 @@ struct u_suballocator;
  * at the moment.
  */
 struct si_resource {
-       struct threaded_resource        b;
-
-       /* Winsys objects. */
-       struct pb_buffer                *buf;
-       uint64_t                        gpu_address;
-       /* Memory usage if the buffer placement is optimal. */
-       uint64_t                        vram_usage;
-       uint64_t                        gart_usage;
-
-       /* Resource properties. */
-       uint64_t                        bo_size;
-       unsigned                        bo_alignment;
-       enum radeon_bo_domain           domains;
-       enum radeon_bo_flag             flags;
-       unsigned                        bind_history;
-       int                             max_forced_staging_uploads;
-
-       /* The buffer range which is initialized (with a write transfer,
-        * streamout, DMA, or as a random access target). The rest of
-        * the buffer is considered invalid and can be mapped unsynchronized.
-        *
-        * This allows unsynchronized mapping of a buffer range which hasn't
-        * been used yet. It's for applications which forget to use
-        * the unsynchronized map flag and expect the driver to figure it out.
-         */
-       struct util_range               valid_buffer_range;
-
-       /* For buffers only. This indicates that a write operation has been
-        * performed by TC L2, but the cache hasn't been flushed.
-        * Any hw block which doesn't use or bypasses TC L2 should check this
-        * flag and flush the cache before using the buffer.
-        *
-        * For example, TC L2 must be flushed if a buffer which has been
-        * modified by a shader store instruction is about to be used as
-        * an index buffer. The reason is that VGT DMA index fetching doesn't
-        * use TC L2.
-        */
-       bool                            TC_L2_dirty;
-
-       /* Whether this resource is referenced by bindless handles. */
-       bool                            texture_handle_allocated;
-       bool                            image_handle_allocated;
-
-       /* Whether the resource has been exported via resource_get_handle. */
-       unsigned                        external_usage; /* PIPE_HANDLE_USAGE_* */
+   struct threaded_resource b;
+
+   /* Winsys objects. */
+   struct pb_buffer *buf;
+   uint64_t gpu_address;
+   /* Memory usage if the buffer placement is optimal. */
+   uint64_t vram_usage;
+   uint64_t gart_usage;
+
+   /* Resource properties. */
+   uint64_t bo_size;
+   unsigned bo_alignment;
+   enum radeon_bo_domain domains;
+   enum radeon_bo_flag flags;
+   unsigned bind_history;
+   int max_forced_staging_uploads;
+
+   /* The buffer range which is initialized (with a write transfer,
+    * streamout, DMA, or as a random access target). The rest of
+    * the buffer is considered invalid and can be mapped unsynchronized.
+    *
+    * This allows unsynchronized mapping of a buffer range which hasn't
+    * been used yet. It's for applications which forget to use
+    * the unsynchronized map flag and expect the driver to figure it out.
+    */
+   struct util_range valid_buffer_range;
+
+   /* For buffers only. This indicates that a write operation has been
+    * performed by TC L2, but the cache hasn't been flushed.
+    * Any hw block which doesn't use or bypasses TC L2 should check this
+    * flag and flush the cache before using the buffer.
+    *
+    * For example, TC L2 must be flushed if a buffer which has been
+    * modified by a shader store instruction is about to be used as
+    * an index buffer. The reason is that VGT DMA index fetching doesn't
+    * use TC L2.
+    */
+   bool TC_L2_dirty;
+
+   /* Whether this resource is referenced by bindless handles. */
+   bool texture_handle_allocated;
+   bool image_handle_allocated;
+
+   /* Whether the resource has been exported via resource_get_handle. */
+   unsigned external_usage; /* PIPE_HANDLE_USAGE_* */
 };
 
 struct si_transfer {
-       struct threaded_transfer        b;
-       struct si_resource              *staging;
-       unsigned                        offset;
+   struct threaded_transfer b;
+   struct si_resource *staging;
+   unsigned offset;
 };
 
 struct si_texture {
-       struct si_resource              buffer;
-
-       struct radeon_surf              surface;
-       struct si_texture               *flushed_depth_texture;
-
-       /* One texture allocation can contain these buffers:
-        * - image (pixel data)
-        * - FMASK buffer (MSAA compression)
-        * - CMASK buffer (MSAA compression and/or legacy fast color clear)
-        * - HTILE buffer (Z/S compression and fast Z/S clear)
-        * - DCC buffer (color compression and new fast color clear)
-        * - displayable DCC buffer (if the DCC buffer is not displayable)
-        * - DCC retile mapping buffer (if the DCC buffer is not displayable)
-        */
-       uint64_t                        cmask_base_address_reg;
-       struct si_resource              *cmask_buffer;
-       unsigned                        cb_color_info; /* fast clear enable bit */
-       unsigned                        color_clear_value[2];
-       unsigned                        last_msaa_resolve_target_micro_mode;
-       unsigned                        num_level0_transfers;
-       unsigned                        plane_index; /* other planes are different pipe_resources */
-       unsigned                        num_planes;
-
-       /* Depth buffer compression and fast clear. */
-       float                           depth_clear_value;
-       uint16_t                        dirty_level_mask; /* each bit says if that mipmap is compressed */
-       uint16_t                        stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */
-       enum pipe_format                db_render_format:16;
-       uint8_t                         stencil_clear_value;
-       bool                            fmask_is_identity:1;
-       bool                            tc_compatible_htile:1;
-       bool                            htile_stencil_disabled:1;
-       bool                            depth_cleared:1; /* if it was cleared at least once */
-       bool                            stencil_cleared:1; /* if it was cleared at least once */
-       bool                            upgraded_depth:1; /* upgraded from unorm to Z32_FLOAT */
-       bool                            is_depth:1;
-       bool                            db_compatible:1;
-       bool                            can_sample_z:1;
-       bool                            can_sample_s:1;
-
-       /* We need to track DCC dirtiness, because st/dri usually calls
-        * flush_resource twice per frame (not a bug) and we don't wanna
-        * decompress DCC twice. Also, the dirty tracking must be done even
-        * if DCC isn't used, because it's required by the DCC usage analysis
-        * for a possible future enablement.
-        */
-       bool                            separate_dcc_dirty:1;
-       bool                            displayable_dcc_dirty:1;
-
-       /* Statistics gathering for the DCC enablement heuristic. */
-       bool                            dcc_gather_statistics:1;
-       /* Counter that should be non-zero if the texture is bound to a
-        * framebuffer.
-        */
-       unsigned                        framebuffers_bound;
-       /* Whether the texture is a displayable back buffer and needs DCC
-        * decompression, which is expensive. Therefore, it's enabled only
-        * if statistics suggest that it will pay off and it's allocated
-        * separately. It can't be bound as a sampler by apps. Limited to
-        * target == 2D and last_level == 0. If enabled, dcc_offset contains
-        * the absolute GPUVM address, not the relative one.
-        */
-       struct si_resource              *dcc_separate_buffer;
-       /* When DCC is temporarily disabled, the separate buffer is here. */
-       struct si_resource              *last_dcc_separate_buffer;
-       /* Estimate of how much this color buffer is written to in units of
-        * full-screen draws: ps_invocations / (width * height)
-        * Shader kills, late Z, and blending with trivial discards make it
-        * inaccurate (we need to count CB updates, not PS invocations).
-        */
-       unsigned                        ps_draw_ratio;
-       /* The number of clears since the last DCC usage analysis. */
-       unsigned                        num_slow_clears;
+   struct si_resource buffer;
+
+   struct radeon_surf surface;
+   struct si_texture *flushed_depth_texture;
+
+   /* One texture allocation can contain these buffers:
+    * - image (pixel data)
+    * - FMASK buffer (MSAA compression)
+    * - CMASK buffer (MSAA compression and/or legacy fast color clear)
+    * - HTILE buffer (Z/S compression and fast Z/S clear)
+    * - DCC buffer (color compression and new fast color clear)
+    * - displayable DCC buffer (if the DCC buffer is not displayable)
+    * - DCC retile mapping buffer (if the DCC buffer is not displayable)
+    */
+   uint64_t cmask_base_address_reg;
+   struct si_resource *cmask_buffer;
+   unsigned cb_color_info; /* fast clear enable bit */
+   unsigned color_clear_value[2];
+   unsigned last_msaa_resolve_target_micro_mode;
+   unsigned num_level0_transfers;
+   unsigned plane_index; /* other planes are different pipe_resources */
+   unsigned num_planes;
+
+   /* Depth buffer compression and fast clear. */
+   float depth_clear_value;
+   uint16_t dirty_level_mask;         /* each bit says if that mipmap is compressed */
+   uint16_t stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */
+   enum pipe_format db_render_format : 16;
+   uint8_t stencil_clear_value;
+   bool fmask_is_identity : 1;
+   bool tc_compatible_htile : 1;
+   bool htile_stencil_disabled : 1;
+   bool depth_cleared : 1;   /* if it was cleared at least once */
+   bool stencil_cleared : 1; /* if it was cleared at least once */
+   bool upgraded_depth : 1;  /* upgraded from unorm to Z32_FLOAT */
+   bool is_depth : 1;
+   bool db_compatible : 1;
+   bool can_sample_z : 1;
+   bool can_sample_s : 1;
+
+   /* We need to track DCC dirtiness, because st/dri usually calls
+    * flush_resource twice per frame (not a bug) and we don't wanna
+    * decompress DCC twice. Also, the dirty tracking must be done even
+    * if DCC isn't used, because it's required by the DCC usage analysis
+    * for a possible future enablement.
+    */
+   bool separate_dcc_dirty : 1;
+   bool displayable_dcc_dirty : 1;
+
+   /* Statistics gathering for the DCC enablement heuristic. */
+   bool dcc_gather_statistics : 1;
+   /* Counter that should be non-zero if the texture is bound to a
+    * framebuffer.
+    */
+   unsigned framebuffers_bound;
+   /* Whether the texture is a displayable back buffer and needs DCC
+    * decompression, which is expensive. Therefore, it's enabled only
+    * if statistics suggest that it will pay off and it's allocated
+    * separately. It can't be bound as a sampler by apps. Limited to
+    * target == 2D and last_level == 0. If enabled, dcc_offset contains
+    * the absolute GPUVM address, not the relative one.
+    */
+   struct si_resource *dcc_separate_buffer;
+   /* When DCC is temporarily disabled, the separate buffer is here. */
+   struct si_resource *last_dcc_separate_buffer;
+   /* Estimate of how much this color buffer is written to in units of
+    * full-screen draws: ps_invocations / (width * height)
+    * Shader kills, late Z, and blending with trivial discards make it
+    * inaccurate (we need to count CB updates, not PS invocations).
+    */
+   unsigned ps_draw_ratio;
+   /* The number of clears since the last DCC usage analysis. */
+   unsigned num_slow_clears;
 };
 
 struct si_surface {
-       struct pipe_surface             base;
-
-       /* These can vary with block-compressed textures. */
-       uint16_t width0;
-       uint16_t height0;
-
-       bool color_initialized:1;
-       bool depth_initialized:1;
-
-       /* Misc. color flags. */
-       bool color_is_int8:1;
-       bool color_is_int10:1;
-       bool dcc_incompatible:1;
-
-       /* Color registers. */
-       unsigned cb_color_info;
-       unsigned cb_color_view;
-       unsigned cb_color_attrib;
-       unsigned cb_color_attrib2;      /* GFX9 and later */
-       unsigned cb_color_attrib3;      /* GFX10 and later */
-       unsigned cb_dcc_control;        /* GFX8 and later */
-       unsigned spi_shader_col_format:8;       /* no blending, no alpha-to-coverage. */
-       unsigned spi_shader_col_format_alpha:8; /* alpha-to-coverage */
-       unsigned spi_shader_col_format_blend:8; /* blending without alpha. */
-       unsigned spi_shader_col_format_blend_alpha:8; /* blending with alpha. */
-
-       /* DB registers. */
-       uint64_t db_depth_base;         /* DB_Z_READ/WRITE_BASE */
-       uint64_t db_stencil_base;
-       uint64_t db_htile_data_base;
-       unsigned db_depth_info;
-       unsigned db_z_info;
-       unsigned db_z_info2;            /* GFX9 only */
-       unsigned db_depth_view;
-       unsigned db_depth_size;
-       unsigned db_depth_slice;
-       unsigned db_stencil_info;
-       unsigned db_stencil_info2;      /* GFX9 only */
-       unsigned db_htile_surface;
+   struct pipe_surface base;
+
+   /* These can vary with block-compressed textures. */
+   uint16_t width0;
+   uint16_t height0;
+
+   bool color_initialized : 1;
+   bool depth_initialized : 1;
+
+   /* Misc. color flags. */
+   bool color_is_int8 : 1;
+   bool color_is_int10 : 1;
+   bool dcc_incompatible : 1;
+
+   /* Color registers. */
+   unsigned cb_color_info;
+   unsigned cb_color_view;
+   unsigned cb_color_attrib;
+   unsigned cb_color_attrib2;                      /* GFX9 and later */
+   unsigned cb_color_attrib3;                      /* GFX10 and later */
+   unsigned cb_dcc_control;                        /* GFX8 and later */
+   unsigned spi_shader_col_format : 8;             /* no blending, no alpha-to-coverage. */
+   unsigned spi_shader_col_format_alpha : 8;       /* alpha-to-coverage */
+   unsigned spi_shader_col_format_blend : 8;       /* blending without alpha. */
+   unsigned spi_shader_col_format_blend_alpha : 8; /* blending with alpha. */
+
+   /* DB registers. */
+   uint64_t db_depth_base; /* DB_Z_READ/WRITE_BASE */
+   uint64_t db_stencil_base;
+   uint64_t db_htile_data_base;
+   unsigned db_depth_info;
+   unsigned db_z_info;
+   unsigned db_z_info2; /* GFX9 only */
+   unsigned db_depth_view;
+   unsigned db_depth_size;
+   unsigned db_depth_slice;
+   unsigned db_stencil_info;
+   unsigned db_stencil_info2; /* GFX9 only */
+   unsigned db_htile_surface;
 };
 
 struct si_mmio_counter {
-       unsigned busy;
-       unsigned idle;
+   unsigned busy;
+   unsigned idle;
 };
 
 union si_mmio_counters {
-       struct {
-               /* For global GPU load including SDMA. */
-               struct si_mmio_counter gpu;
-
-               /* GRBM_STATUS */
-               struct si_mmio_counter spi;
-               struct si_mmio_counter gui;
-               struct si_mmio_counter ta;
-               struct si_mmio_counter gds;
-               struct si_mmio_counter vgt;
-               struct si_mmio_counter ia;
-               struct si_mmio_counter sx;
-               struct si_mmio_counter wd;
-               struct si_mmio_counter bci;
-               struct si_mmio_counter sc;
-               struct si_mmio_counter pa;
-               struct si_mmio_counter db;
-               struct si_mmio_counter cp;
-               struct si_mmio_counter cb;
-
-               /* SRBM_STATUS2 */
-               struct si_mmio_counter sdma;
-
-               /* CP_STAT */
-               struct si_mmio_counter pfp;
-               struct si_mmio_counter meq;
-               struct si_mmio_counter me;
-               struct si_mmio_counter surf_sync;
-               struct si_mmio_counter cp_dma;
-               struct si_mmio_counter scratch_ram;
-       } named;
-       unsigned array[0];
+   struct {
+      /* For global GPU load including SDMA. */
+      struct si_mmio_counter gpu;
+
+      /* GRBM_STATUS */
+      struct si_mmio_counter spi;
+      struct si_mmio_counter gui;
+      struct si_mmio_counter ta;
+      struct si_mmio_counter gds;
+      struct si_mmio_counter vgt;
+      struct si_mmio_counter ia;
+      struct si_mmio_counter sx;
+      struct si_mmio_counter wd;
+      struct si_mmio_counter bci;
+      struct si_mmio_counter sc;
+      struct si_mmio_counter pa;
+      struct si_mmio_counter db;
+      struct si_mmio_counter cp;
+      struct si_mmio_counter cb;
+
+      /* SRBM_STATUS2 */
+      struct si_mmio_counter sdma;
+
+      /* CP_STAT */
+      struct si_mmio_counter pfp;
+      struct si_mmio_counter meq;
+      struct si_mmio_counter me;
+      struct si_mmio_counter surf_sync;
+      struct si_mmio_counter cp_dma;
+      struct si_mmio_counter scratch_ram;
+   } named;
+   unsigned array[0];
 };
 
 struct si_memory_object {
-       struct pipe_memory_object       b;
-       struct pb_buffer                *buf;
-       uint32_t                        stride;
+   struct pipe_memory_object b;
+   struct pb_buffer *buf;
+   uint32_t stride;
 };
 
 /* Saved CS data for debugging features. */
 struct radeon_saved_cs {
-       uint32_t                        *ib;
-       unsigned                        num_dw;
+   uint32_t *ib;
+   unsigned num_dw;
 
-       struct radeon_bo_list_item      *bo_list;
-       unsigned                        bo_count;
+   struct radeon_bo_list_item *bo_list;
+   unsigned bo_count;
 };
 
 struct si_screen {
-       struct pipe_screen              b;
-       struct radeon_winsys            *ws;
-       struct disk_cache               *disk_shader_cache;
-
-       struct radeon_info              info;
-       uint64_t                        debug_flags;
-       char                            renderer_string[183];
-
-       void (*make_texture_descriptor)(
-                       struct si_screen *screen,
-                       struct si_texture *tex,
-                       bool sampler,
-                       enum pipe_texture_target target,
-                       enum pipe_format pipe_format,
-                       const unsigned char state_swizzle[4],
-                       unsigned first_level, unsigned last_level,
-                       unsigned first_layer, unsigned last_layer,
-                       unsigned width, unsigned height, unsigned depth,
-                       uint32_t *state,
-                       uint32_t *fmask_state);
-
-       unsigned                        num_vbos_in_user_sgprs;
-       unsigned                        pa_sc_raster_config;
-       unsigned                        pa_sc_raster_config_1;
-       unsigned                        se_tile_repeat;
-       unsigned                        gs_table_depth;
-       unsigned                        tess_offchip_block_dw_size;
-       unsigned                        tess_offchip_ring_size;
-       unsigned                        tess_factor_ring_size;
-       unsigned                        vgt_hs_offchip_param;
-       unsigned                        eqaa_force_coverage_samples;
-       unsigned                        eqaa_force_z_samples;
-       unsigned                        eqaa_force_color_samples;
-       bool                            has_draw_indirect_multi;
-       bool                            has_out_of_order_rast;
-       bool                            assume_no_z_fights;
-       bool                            commutative_blend_add;
-       bool                            dpbb_allowed;
-       bool                            dfsm_allowed;
-       bool                            llvm_has_working_vgpr_indexing;
-       bool                            use_ngg;
-       bool                            use_ngg_culling;
-       bool                            always_use_ngg_culling;
-       bool                            use_ngg_streamout;
-
-       struct {
-#define OPT_BOOL(name, dflt, description) bool name:1;
+   struct pipe_screen b;
+   struct radeon_winsys *ws;
+   struct disk_cache *disk_shader_cache;
+
+   struct radeon_info info;
+   uint64_t debug_flags;
+   char renderer_string[183];
+
+   void (*make_texture_descriptor)(struct si_screen *screen, struct si_texture *tex, bool sampler,
+                                   enum pipe_texture_target target, enum pipe_format pipe_format,
+                                   const unsigned char state_swizzle[4], unsigned first_level,
+                                   unsigned last_level, unsigned first_layer, unsigned last_layer,
+                                   unsigned width, unsigned height, unsigned depth, uint32_t *state,
+                                   uint32_t *fmask_state);
+
+   unsigned num_vbos_in_user_sgprs;
+   unsigned pa_sc_raster_config;
+   unsigned pa_sc_raster_config_1;
+   unsigned se_tile_repeat;
+   unsigned gs_table_depth;
+   unsigned tess_offchip_block_dw_size;
+   unsigned tess_offchip_ring_size;
+   unsigned tess_factor_ring_size;
+   unsigned vgt_hs_offchip_param;
+   unsigned eqaa_force_coverage_samples;
+   unsigned eqaa_force_z_samples;
+   unsigned eqaa_force_color_samples;
+   bool has_draw_indirect_multi;
+   bool has_out_of_order_rast;
+   bool assume_no_z_fights;
+   bool commutative_blend_add;
+   bool dpbb_allowed;
+   bool dfsm_allowed;
+   bool llvm_has_working_vgpr_indexing;
+   bool use_ngg;
+   bool use_ngg_culling;
+   bool always_use_ngg_culling;
+   bool use_ngg_streamout;
+
+   struct {
+#define OPT_BOOL(name, dflt, description) bool name : 1;
 #include "si_debug_options.h"
-       } options;
-
-       /* Whether shaders are monolithic (1-part) or separate (3-part). */
-       bool                            use_monolithic_shaders;
-       bool                            record_llvm_ir;
-       bool                            dcc_msaa_allowed;
-
-       struct slab_parent_pool         pool_transfers;
-
-       /* Texture filter settings. */
-       int                             force_aniso; /* -1 = disabled */
-
-       /* Auxiliary context. Mainly used to initialize resources.
-        * It must be locked prior to using and flushed before unlocking. */
-       struct pipe_context             *aux_context;
-       simple_mtx_t                    aux_context_lock;
-
-       /* This must be in the screen, because UE4 uses one context for
-        * compilation and another one for rendering.
-        */
-       unsigned                        num_compilations;
-       /* Along with ST_DEBUG=precompile, this should show if applications
-        * are loading shaders on demand. This is a monotonic counter.
-        */
-       unsigned                        num_shaders_created;
-       unsigned                        num_memory_shader_cache_hits;
-       unsigned                        num_memory_shader_cache_misses;
-       unsigned                        num_disk_shader_cache_hits;
-       unsigned                        num_disk_shader_cache_misses;
-
-       /* GPU load thread. */
-       simple_mtx_t                    gpu_load_mutex;
-       thrd_t                          gpu_load_thread;
-       union si_mmio_counters  mmio_counters;
-       volatile unsigned               gpu_load_stop_thread; /* bool */
-
-       /* Performance counters. */
-       struct si_perfcounters  *perfcounters;
-
-       /* If pipe_screen wants to recompute and re-emit the framebuffer,
-        * sampler, and image states of all contexts, it should atomically
-        * increment this.
-        *
-        * Each context will compare this with its own last known value of
-        * the counter before drawing and re-emit the states accordingly.
-        */
-       unsigned                        dirty_tex_counter;
-       unsigned                        dirty_buf_counter;
-
-       /* Atomically increment this counter when an existing texture's
-        * metadata is enabled or disabled in a way that requires changing
-        * contexts' compressed texture binding masks.
-        */
-       unsigned                        compressed_colortex_counter;
-
-       struct {
-               /* Context flags to set so that all writes from earlier jobs
-                * in the CP are seen by L2 clients.
-                */
-               unsigned cp_to_L2;
-
-               /* Context flags to set so that all writes from earlier jobs
-                * that end in L2 are seen by CP.
-                */
-               unsigned L2_to_cp;
-       } barrier_flags;
-
-       simple_mtx_t                    shader_parts_mutex;
-       struct si_shader_part           *vs_prologs;
-       struct si_shader_part           *tcs_epilogs;
-       struct si_shader_part           *gs_prologs;
-       struct si_shader_part           *ps_prologs;
-       struct si_shader_part           *ps_epilogs;
-
-       /* Shader cache in memory.
-        *
-        * Design & limitations:
-        * - The shader cache is per screen (= per process), never saved to
-        *   disk, and skips redundant shader compilations from NIR to bytecode.
-        * - It can only be used with one-variant-per-shader support, in which
-        *   case only the main (typically middle) part of shaders is cached.
-        * - Only VS, TCS, TES, PS are cached, out of which only the hw VS
-        *   variants of VS and TES are cached, so LS and ES aren't.
-        * - GS and CS aren't cached, but it's certainly possible to cache
-        *   those as well.
-        */
-       simple_mtx_t                    shader_cache_mutex;
-       struct hash_table               *shader_cache;
-
-       /* Shader cache of live shaders. */
-       struct util_live_shader_cache   live_shader_cache;
-
-       /* Shader compiler queue for multithreaded compilation. */
-       struct util_queue               shader_compiler_queue;
-       /* Use at most 3 normal compiler threads on quadcore and better.
-        * Hyperthreaded CPUs report the number of threads, but we want
-        * the number of cores. We only need this many threads for shader-db. */
-       struct ac_llvm_compiler         compiler[24]; /* used by the queue only */
-
-       struct util_queue               shader_compiler_queue_low_priority;
-       /* Use at most 2 low priority threads on quadcore and better.
-        * We want to minimize the impact on multithreaded Mesa. */
-       struct ac_llvm_compiler         compiler_lowp[10];
-
-       unsigned                        compute_wave_size;
-       unsigned                        ps_wave_size;
-       unsigned                        ge_wave_size;
+   } options;
+
+   /* Whether shaders are monolithic (1-part) or separate (3-part). */
+   bool use_monolithic_shaders;
+   bool record_llvm_ir;
+   bool dcc_msaa_allowed;
+
+   struct slab_parent_pool pool_transfers;
+
+   /* Texture filter settings. */
+   int force_aniso; /* -1 = disabled */
+
+   /* Auxiliary context. Mainly used to initialize resources.
+    * It must be locked prior to using and flushed before unlocking. */
+   struct pipe_context *aux_context;
+   simple_mtx_t aux_context_lock;
+
+   /* This must be in the screen, because UE4 uses one context for
+    * compilation and another one for rendering.
+    */
+   unsigned num_compilations;
+   /* Along with ST_DEBUG=precompile, this should show if applications
+    * are loading shaders on demand. This is a monotonic counter.
+    */
+   unsigned num_shaders_created;
+   unsigned num_memory_shader_cache_hits;
+   unsigned num_memory_shader_cache_misses;
+   unsigned num_disk_shader_cache_hits;
+   unsigned num_disk_shader_cache_misses;
+
+   /* GPU load thread. */
+   simple_mtx_t gpu_load_mutex;
+   thrd_t gpu_load_thread;
+   union si_mmio_counters mmio_counters;
+   volatile unsigned gpu_load_stop_thread; /* bool */
+
+   /* Performance counters. */
+   struct si_perfcounters *perfcounters;
+
+   /* If pipe_screen wants to recompute and re-emit the framebuffer,
+    * sampler, and image states of all contexts, it should atomically
+    * increment this.
+    *
+    * Each context will compare this with its own last known value of
+    * the counter before drawing and re-emit the states accordingly.
+    */
+   unsigned dirty_tex_counter;
+   unsigned dirty_buf_counter;
+
+   /* Atomically increment this counter when an existing texture's
+    * metadata is enabled or disabled in a way that requires changing
+    * contexts' compressed texture binding masks.
+    */
+   unsigned compressed_colortex_counter;
+
+   struct {
+      /* Context flags to set so that all writes from earlier jobs
+       * in the CP are seen by L2 clients.
+       */
+      unsigned cp_to_L2;
+
+      /* Context flags to set so that all writes from earlier jobs
+       * that end in L2 are seen by CP.
+       */
+      unsigned L2_to_cp;
+   } barrier_flags;
+
+   simple_mtx_t shader_parts_mutex;
+   struct si_shader_part *vs_prologs;
+   struct si_shader_part *tcs_epilogs;
+   struct si_shader_part *gs_prologs;
+   struct si_shader_part *ps_prologs;
+   struct si_shader_part *ps_epilogs;
+
+   /* Shader cache in memory.
+    *
+    * Design & limitations:
+    * - The shader cache is per screen (= per process), never saved to
+    *   disk, and skips redundant shader compilations from NIR to bytecode.
+    * - It can only be used with one-variant-per-shader support, in which
+    *   case only the main (typically middle) part of shaders is cached.
+    * - Only VS, TCS, TES, PS are cached, out of which only the hw VS
+    *   variants of VS and TES are cached, so LS and ES aren't.
+    * - GS and CS aren't cached, but it's certainly possible to cache
+    *   those as well.
+    */
+   simple_mtx_t shader_cache_mutex;
+   struct hash_table *shader_cache;
+
+   /* Shader cache of live shaders. */
+   struct util_live_shader_cache live_shader_cache;
+
+   /* Shader compiler queue for multithreaded compilation. */
+   struct util_queue shader_compiler_queue;
+   /* Use at most 3 normal compiler threads on quadcore and better.
+    * Hyperthreaded CPUs report the number of threads, but we want
+    * the number of cores. We only need this many threads for shader-db. */
+   struct ac_llvm_compiler compiler[24]; /* used by the queue only */
+
+   struct util_queue shader_compiler_queue_low_priority;
+   /* Use at most 2 low priority threads on quadcore and better.
+    * We want to minimize the impact on multithreaded Mesa. */
+   struct ac_llvm_compiler compiler_lowp[10];
+
+   unsigned compute_wave_size;
+   unsigned ps_wave_size;
+   unsigned ge_wave_size;
 };
 
 struct si_blend_color {
-       struct pipe_blend_color         state;
-       bool                            any_nonzeros;
+   struct pipe_blend_color state;
+   bool any_nonzeros;
 };
 
 struct si_sampler_view {
-       struct pipe_sampler_view        base;
-        /* [0..7] = image descriptor
-         * [4..7] = buffer descriptor */
-       uint32_t                        state[8];
-       uint32_t                        fmask_state[8];
-       const struct legacy_surf_level  *base_level_info;
-       ubyte                           base_level;
-       ubyte                           block_width;
-       bool is_stencil_sampler;
-       bool is_integer;
-       bool dcc_incompatible;
+   struct pipe_sampler_view base;
+   /* [0..7] = image descriptor
+    * [4..7] = buffer descriptor */
+   uint32_t state[8];
+   uint32_t fmask_state[8];
+   const struct legacy_surf_level *base_level_info;
+   ubyte base_level;
+   ubyte block_width;
+   bool is_stencil_sampler;
+   bool is_integer;
+   bool dcc_incompatible;
 };
 
 #define SI_SAMPLER_STATE_MAGIC 0x34f1c35a
 
 struct si_sampler_state {
 #ifndef NDEBUG
-       unsigned                        magic;
+   unsigned magic;
 #endif
-       uint32_t                        val[4];
-       uint32_t                        integer_val[4];
-       uint32_t                        upgraded_depth_val[4];
+   uint32_t val[4];
+   uint32_t integer_val[4];
+   uint32_t upgraded_depth_val[4];
 };
 
 struct si_cs_shader_state {
-       struct si_compute               *program;
-       struct si_compute               *emitted_program;
-       unsigned                        offset;
-       bool                            initialized;
-       bool                            uses_scratch;
+   struct si_compute *program;
+   struct si_compute *emitted_program;
+   unsigned offset;
+   bool initialized;
+   bool uses_scratch;
 };
 
 struct si_samplers {
-       struct pipe_sampler_view        *views[SI_NUM_SAMPLERS];
-       struct si_sampler_state         *sampler_states[SI_NUM_SAMPLERS];
+   struct pipe_sampler_view *views[SI_NUM_SAMPLERS];
+   struct si_sampler_state *sampler_states[SI_NUM_SAMPLERS];
 
-       /* The i-th bit is set if that element is enabled (non-NULL resource). */
-       unsigned                        enabled_mask;
-       uint32_t                        needs_depth_decompress_mask;
-       uint32_t                        needs_color_decompress_mask;
+   /* The i-th bit is set if that element is enabled (non-NULL resource). */
+   unsigned enabled_mask;
+   uint32_t needs_depth_decompress_mask;
+   uint32_t needs_color_decompress_mask;
 };
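
A minimal sketch of the enabled_mask invariant described in the comment above, i.e. bit i is set
exactly when views[i] is non-NULL; this is an illustration, not the driver's actual helper:

   static unsigned recompute_enabled_mask(const struct si_samplers *samplers)
   {
      unsigned mask = 0;

      for (unsigned i = 0; i < SI_NUM_SAMPLERS; i++) {
         if (samplers->views[i])
            mask |= 1u << i;
      }
      return mask;
   }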
 
 struct si_images {
-       struct pipe_image_view          views[SI_NUM_IMAGES];
-       uint32_t                        needs_color_decompress_mask;
-       unsigned                        enabled_mask;
+   struct pipe_image_view views[SI_NUM_IMAGES];
+   uint32_t needs_color_decompress_mask;
+   unsigned enabled_mask;
 };
 
 struct si_framebuffer {
-       struct pipe_framebuffer_state   state;
-       unsigned                        colorbuf_enabled_4bit;
-       unsigned                        spi_shader_col_format;
-       unsigned                        spi_shader_col_format_alpha;
-       unsigned                        spi_shader_col_format_blend;
-       unsigned                        spi_shader_col_format_blend_alpha;
-       ubyte                           nr_samples:5; /* at most 16xAA */
-       ubyte                           log_samples:3; /* at most 4 = 16xAA */
-       ubyte                           nr_color_samples; /* at most 8xAA */
-       ubyte                           compressed_cb_mask;
-       ubyte                           uncompressed_cb_mask;
-       ubyte                           displayable_dcc_cb_mask;
-       ubyte                           color_is_int8;
-       ubyte                           color_is_int10;
-       ubyte                           dirty_cbufs;
-       ubyte                           dcc_overwrite_combiner_watermark;
-       ubyte                           min_bytes_per_pixel;
-       bool                            dirty_zsbuf;
-       bool                            any_dst_linear;
-       bool                            CB_has_shader_readable_metadata;
-       bool                            DB_has_shader_readable_metadata;
-       bool                            all_DCC_pipe_aligned;
+   struct pipe_framebuffer_state state;
+   unsigned colorbuf_enabled_4bit;
+   unsigned spi_shader_col_format;
+   unsigned spi_shader_col_format_alpha;
+   unsigned spi_shader_col_format_blend;
+   unsigned spi_shader_col_format_blend_alpha;
+   ubyte nr_samples : 5;   /* at most 16xAA */
+   ubyte log_samples : 3;  /* at most 4 = 16xAA */
+   ubyte nr_color_samples; /* at most 8xAA */
+   ubyte compressed_cb_mask;
+   ubyte uncompressed_cb_mask;
+   ubyte displayable_dcc_cb_mask;
+   ubyte color_is_int8;
+   ubyte color_is_int10;
+   ubyte dirty_cbufs;
+   ubyte dcc_overwrite_combiner_watermark;
+   ubyte min_bytes_per_pixel;
+   bool dirty_zsbuf;
+   bool any_dst_linear;
+   bool CB_has_shader_readable_metadata;
+   bool DB_has_shader_readable_metadata;
+   bool all_DCC_pipe_aligned;
 };
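
A minimal sketch of how the sample-count bitfields above relate, per their comments (log_samples is
the base-2 log of nr_samples, so 16xAA fits in 5 and 3 bits respectively); fb is a hypothetical
pointer and util_logbase2 comes from util/u_math.h:

   fb->nr_samples = 8;                              /* 8xAA, at most 16 */
   fb->log_samples = util_logbase2(fb->nr_samples); /* 3, at most 4 */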
 
-enum si_quant_mode {
-       /* This is the list we want to support. */
-       SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH,
-       SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH,
-       SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH,
+enum si_quant_mode
+{
+   /* This is the list we want to support. */
+   SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH,
+   SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH,
+   SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH,
 };
 
 struct si_signed_scissor {
-       int minx;
-       int miny;
-       int maxx;
-       int maxy;
-       enum si_quant_mode quant_mode;
+   int minx;
+   int miny;
+   int maxx;
+   int maxy;
+   enum si_quant_mode quant_mode;
 };
 
 struct si_viewports {
-       struct pipe_viewport_state      states[SI_MAX_VIEWPORTS];
-       struct si_signed_scissor        as_scissor[SI_MAX_VIEWPORTS];
-       bool                            y_inverted;
+   struct pipe_viewport_state states[SI_MAX_VIEWPORTS];
+   struct si_signed_scissor as_scissor[SI_MAX_VIEWPORTS];
+   bool y_inverted;
 };
 
 struct si_clip_state {
-       struct pipe_clip_state          state;
-       bool                            any_nonzeros;
+   struct pipe_clip_state state;
+   bool any_nonzeros;
 };
 
 struct si_streamout_target {
-       struct pipe_stream_output_target b;
+   struct pipe_stream_output_target b;
 
-       /* The buffer where BUFFER_FILLED_SIZE is stored. */
-       struct si_resource      *buf_filled_size;
-       unsigned                buf_filled_size_offset;
-       bool                    buf_filled_size_valid;
+   /* The buffer where BUFFER_FILLED_SIZE is stored. */
+   struct si_resource *buf_filled_size;
+   unsigned buf_filled_size_offset;
+   bool buf_filled_size_valid;
 
-       unsigned                stride_in_dw;
+   unsigned stride_in_dw;
 };
 
 struct si_streamout {
-       bool                            begin_emitted;
+   bool begin_emitted;
 
-       unsigned                        enabled_mask;
-       unsigned                        num_targets;
-       struct si_streamout_target      *targets[PIPE_MAX_SO_BUFFERS];
+   unsigned enabled_mask;
+   unsigned num_targets;
+   struct si_streamout_target *targets[PIPE_MAX_SO_BUFFERS];
 
-       unsigned                        append_bitmask;
-       bool                            suspended;
+   unsigned append_bitmask;
+   bool suspended;
 
-       /* External state which comes from the vertex shader;
-        * it must be set explicitly when binding a shader. */
-       uint16_t                        *stride_in_dw;
-       unsigned                        enabled_stream_buffers_mask; /* stream0 buffers0-3 in 4 LSB */
+   /* External state which comes from the vertex shader;
+    * it must be set explicitly when binding a shader. */
+   uint16_t *stride_in_dw;
+   unsigned enabled_stream_buffers_mask; /* stream0 buffers0-3 in 4 LSB */
 
-       /* The state of VGT_STRMOUT_BUFFER_(CONFIG|EN). */
-       unsigned                        hw_enabled_mask;
+   /* The state of VGT_STRMOUT_BUFFER_(CONFIG|EN). */
+   unsigned hw_enabled_mask;
 
-       /* The state of VGT_STRMOUT_(CONFIG|EN). */
-       bool                            streamout_enabled;
-       bool                            prims_gen_query_enabled;
-       int                             num_prims_gen_queries;
+   /* The state of VGT_STRMOUT_(CONFIG|EN). */
+   bool streamout_enabled;
+   bool prims_gen_query_enabled;
+   int num_prims_gen_queries;
 };
 
 /* A shader state consists of the shader selector, which is a constant state
@@ -773,494 +773,488 @@ struct si_streamout {
  * the current shader variant selected for this context.
  */
 struct si_shader_ctx_state {
-       struct si_shader_selector       *cso;
-       struct si_shader                *current;
+   struct si_shader_selector *cso;
+   struct si_shader *current;
 };
 
 #define SI_NUM_VGT_PARAM_KEY_BITS 12
-#define SI_NUM_VGT_PARAM_STATES (1 << SI_NUM_VGT_PARAM_KEY_BITS)
+#define SI_NUM_VGT_PARAM_STATES   (1 << SI_NUM_VGT_PARAM_KEY_BITS)
 
 /* The IA_MULTI_VGT_PARAM key used to index the table of precomputed values.
  * Some fields are set by state-change calls, most are set by draw_vbo.
  */
 union si_vgt_param_key {
-       struct {
+   struct {
 #if UTIL_ARCH_LITTLE_ENDIAN
-               unsigned prim:4;
-               unsigned uses_instancing:1;
-               unsigned multi_instances_smaller_than_primgroup:1;
-               unsigned primitive_restart:1;
-               unsigned count_from_stream_output:1;
-               unsigned line_stipple_enabled:1;
-               unsigned uses_tess:1;
-               unsigned tess_uses_prim_id:1;
-               unsigned uses_gs:1;
-               unsigned _pad:32 - SI_NUM_VGT_PARAM_KEY_BITS;
+      unsigned prim : 4;
+      unsigned uses_instancing : 1;
+      unsigned multi_instances_smaller_than_primgroup : 1;
+      unsigned primitive_restart : 1;
+      unsigned count_from_stream_output : 1;
+      unsigned line_stipple_enabled : 1;
+      unsigned uses_tess : 1;
+      unsigned tess_uses_prim_id : 1;
+      unsigned uses_gs : 1;
+      unsigned _pad : 32 - SI_NUM_VGT_PARAM_KEY_BITS;
 #else /* UTIL_ARCH_BIG_ENDIAN */
-               unsigned _pad:32 - SI_NUM_VGT_PARAM_KEY_BITS;
-               unsigned uses_gs:1;
-               unsigned tess_uses_prim_id:1;
-               unsigned uses_tess:1;
-               unsigned line_stipple_enabled:1;
-               unsigned count_from_stream_output:1;
-               unsigned primitive_restart:1;
-               unsigned multi_instances_smaller_than_primgroup:1;
-               unsigned uses_instancing:1;
-               unsigned prim:4;
+      unsigned _pad : 32 - SI_NUM_VGT_PARAM_KEY_BITS;
+      unsigned uses_gs : 1;
+      unsigned tess_uses_prim_id : 1;
+      unsigned uses_tess : 1;
+      unsigned line_stipple_enabled : 1;
+      unsigned count_from_stream_output : 1;
+      unsigned primitive_restart : 1;
+      unsigned multi_instances_smaller_than_primgroup : 1;
+      unsigned uses_instancing : 1;
+      unsigned prim : 4;
 #endif
-       } u;
-       uint32_t index;
+   } u;
+   uint32_t index;
 };
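
A minimal sketch of the intended use: the bitfield view and the 32-bit index alias each other, so
the packed key can address the table of precomputed IA_MULTI_VGT_PARAM values directly (sctx and
primitive_restart are hypothetical here; the ia_multi_vgt_param table appears in si_context below):

   union si_vgt_param_key key = sctx->ia_multi_vgt_param_key;

   key.u.primitive_restart = primitive_restart; /* example of a draw-time field */
   unsigned ia_multi_vgt_param = sctx->ia_multi_vgt_param[key.index];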
 
 #define SI_NUM_VGT_STAGES_KEY_BITS 6
-#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS)
+#define SI_NUM_VGT_STAGES_STATES   (1 << SI_NUM_VGT_STAGES_KEY_BITS)
 
 /* The VGT_SHADER_STAGES key used to index the table of precomputed values.
  * Some fields are set by state-change calls, most are set by draw_vbo.
  */
 union si_vgt_stages_key {
-       struct {
+   struct {
 #if UTIL_ARCH_LITTLE_ENDIAN
-               unsigned tess:1;
-               unsigned gs:1;
-               unsigned ngg_gs_fast_launch:1;
-               unsigned ngg_passthrough:1;
-               unsigned ngg:1; /* gfx10+ */
-               unsigned streamout:1; /* only used with NGG */
-               unsigned _pad:32 - SI_NUM_VGT_STAGES_KEY_BITS;
+      unsigned tess : 1;
+      unsigned gs : 1;
+      unsigned ngg_gs_fast_launch : 1;
+      unsigned ngg_passthrough : 1;
+      unsigned ngg : 1;       /* gfx10+ */
+      unsigned streamout : 1; /* only used with NGG */
+      unsigned _pad : 32 - SI_NUM_VGT_STAGES_KEY_BITS;
 #else /* UTIL_ARCH_BIG_ENDIAN */
-               unsigned _pad:32 - SI_NUM_VGT_STAGES_KEY_BITS;
-               unsigned streamout:1;
-               unsigned ngg:1;
-               unsigned ngg_passthrough:1;
-               unsigned ngg_gs_fast_launch:1;
-               unsigned gs:1;
-               unsigned tess:1;
+      unsigned _pad : 32 - SI_NUM_VGT_STAGES_KEY_BITS;
+      unsigned streamout : 1;
+      unsigned ngg : 1;
+      unsigned ngg_passthrough : 1;
+      unsigned ngg_gs_fast_launch : 1;
+      unsigned gs : 1;
+      unsigned tess : 1;
 #endif
-       } u;
-       uint32_t index;
+   } u;
+   uint32_t index;
 };
 
-struct si_texture_handle
-{
-       unsigned                        desc_slot;
-       bool                            desc_dirty;
-       struct pipe_sampler_view        *view;
-       struct si_sampler_state         sstate;
+struct si_texture_handle {
+   unsigned desc_slot;
+   bool desc_dirty;
+   struct pipe_sampler_view *view;
+   struct si_sampler_state sstate;
 };
 
-struct si_image_handle
-{
-       unsigned                        desc_slot;
-       bool                            desc_dirty;
-       struct pipe_image_view          view;
+struct si_image_handle {
+   unsigned desc_slot;
+   bool desc_dirty;
+   struct pipe_image_view view;
 };
 
 struct si_saved_cs {
-       struct pipe_reference   reference;
-       struct si_context       *ctx;
-       struct radeon_saved_cs  gfx;
-       struct radeon_saved_cs  compute;
-       struct si_resource      *trace_buf;
-       unsigned                trace_id;
-
-       unsigned                gfx_last_dw;
-       unsigned                compute_last_dw;
-       bool                    flushed;
-       int64_t                 time_flush;
+   struct pipe_reference reference;
+   struct si_context *ctx;
+   struct radeon_saved_cs gfx;
+   struct radeon_saved_cs compute;
+   struct si_resource *trace_buf;
+   unsigned trace_id;
+
+   unsigned gfx_last_dw;
+   unsigned compute_last_dw;
+   bool flushed;
+   int64_t time_flush;
 };
 
 struct si_sdma_upload {
-       struct si_resource      *dst;
-       struct si_resource      *src;
-       unsigned                src_offset;
-       unsigned                dst_offset;
-       unsigned                size;
+   struct si_resource *dst;
+   struct si_resource *src;
+   unsigned src_offset;
+   unsigned dst_offset;
+   unsigned size;
 };
 
 struct si_small_prim_cull_info {
-       float scale[2], translate[2];
+   float scale[2], translate[2];
 };
 
 struct si_context {
-       struct pipe_context             b; /* base class */
-
-       enum radeon_family              family;
-       enum chip_class                 chip_class;
-
-       struct radeon_winsys            *ws;
-       struct radeon_winsys_ctx        *ctx;
-       struct radeon_cmdbuf            *gfx_cs; /* compute IB if graphics is disabled */
-       struct radeon_cmdbuf            *sdma_cs;
-       struct pipe_fence_handle        *last_gfx_fence;
-       struct pipe_fence_handle        *last_sdma_fence;
-       struct si_resource              *eop_bug_scratch;
-       struct u_upload_mgr             *cached_gtt_allocator;
-       struct threaded_context         *tc;
-       struct u_suballocator           *allocator_zeroed_memory;
-       struct slab_child_pool          pool_transfers;
-       struct slab_child_pool          pool_transfers_unsync; /* for threaded_context */
-       struct pipe_device_reset_callback device_reset_callback;
-       struct u_log_context            *log;
-       void                            *query_result_shader;
-       void                            *sh_query_result_shader;
-
-       void (*emit_cache_flush)(struct si_context *ctx);
-
-       struct blitter_context          *blitter;
-       void                            *noop_blend;
-       void                            *noop_dsa;
-       void                            *discard_rasterizer_state;
-       void                            *custom_dsa_flush;
-       void                            *custom_blend_resolve;
-       void                            *custom_blend_fmask_decompress;
-       void                            *custom_blend_eliminate_fastclear;
-       void                            *custom_blend_dcc_decompress;
-       void                            *vs_blit_pos;
-       void                            *vs_blit_pos_layered;
-       void                            *vs_blit_color;
-       void                            *vs_blit_color_layered;
-       void                            *vs_blit_texcoord;
-       void                            *cs_clear_buffer;
-       void                            *cs_copy_buffer;
-       void                            *cs_copy_image;
-       void                            *cs_copy_image_1d_array;
-       void                            *cs_clear_render_target;
-       void                            *cs_clear_render_target_1d_array;
-       void                            *cs_clear_12bytes_buffer;
-       void                            *cs_dcc_retile;
-       void                            *cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */
-       struct si_screen                *screen;
-       struct pipe_debug_callback      debug;
-       struct ac_llvm_compiler         compiler; /* only non-threaded compilation */
-       struct si_shader_ctx_state      fixed_func_tcs_shader;
-       /* Offset 0: EOP flush number; Offset 4: GDS prim restart counter */
-       struct si_resource              *wait_mem_scratch;
-       unsigned                        wait_mem_number;
-       uint16_t                        prefetch_L2_mask;
-
-       bool                            has_graphics;
-       bool                            gfx_flush_in_progress:1;
-       bool                            gfx_last_ib_is_busy:1;
-       bool                            compute_is_busy:1;
-
-       unsigned                        num_gfx_cs_flushes;
-       unsigned                        initial_gfx_cs_size;
-       unsigned                        last_dirty_tex_counter;
-       unsigned                        last_dirty_buf_counter;
-       unsigned                        last_compressed_colortex_counter;
-       unsigned                        last_num_draw_calls;
-       unsigned                        flags; /* flush flags */
-       /* Current unaccounted memory usage. */
-       uint64_t                        vram;
-       uint64_t                        gtt;
-
-       /* Compute-based primitive discard. */
-       unsigned                        prim_discard_vertex_count_threshold;
-       struct pb_buffer                *gds;
-       struct pb_buffer                *gds_oa;
-       struct radeon_cmdbuf            *prim_discard_compute_cs;
-       unsigned                        compute_gds_offset;
-       struct si_shader                *compute_ib_last_shader;
-       uint32_t                        compute_rewind_va;
-       unsigned                        compute_num_prims_in_batch;
-       bool                            preserve_prim_restart_gds_at_flush;
-       /* index_ring is divided into 2 halves for doublebuffering. */
-       struct si_resource              *index_ring;
-       unsigned                        index_ring_base; /* offset of a per-IB portion */
-       unsigned                        index_ring_offset; /* offset within a per-IB portion */
-       unsigned                        index_ring_size_per_ib; /* max available size per IB */
-       bool                            prim_discard_compute_ib_initialized;
-       /* For tracking the last execution barrier - it can be either
-        * a WRITE_DATA packet or a fence. */
-       uint32_t                        *last_pkt3_write_data;
-       struct si_resource              *barrier_buf;
-       unsigned                        barrier_buf_offset;
-       struct pipe_fence_handle        *last_ib_barrier_fence;
-       struct si_resource              *last_ib_barrier_buf;
-       unsigned                        last_ib_barrier_buf_offset;
-
-       /* Atoms (direct states). */
-       union si_state_atoms            atoms;
-       unsigned                        dirty_atoms; /* mask */
-       /* PM4 states (precomputed immutable states) */
-       unsigned                        dirty_states;
-       union si_state                  queued;
-       union si_state                  emitted;
-
-       /* Atom declarations. */
-       struct si_framebuffer           framebuffer;
-       unsigned                        sample_locs_num_samples;
-       uint16_t                        sample_mask;
-       unsigned                        last_cb_target_mask;
-       struct si_blend_color           blend_color;
-       struct si_clip_state            clip_state;
-       struct si_shader_data           shader_pointers;
-       struct si_stencil_ref           stencil_ref;
-       struct pipe_scissor_state       scissors[SI_MAX_VIEWPORTS];
-       struct si_streamout             streamout;
-       struct si_viewports             viewports;
-       unsigned                        num_window_rectangles;
-       bool                            window_rectangles_include;
-       struct pipe_scissor_state       window_rectangles[4];
-
-       /* Precomputed states. */
-       struct si_pm4_state             *init_config;
-       struct si_pm4_state             *init_config_gs_rings;
-       bool                            init_config_has_vgt_flush;
-       struct si_pm4_state             *vgt_shader_config[SI_NUM_VGT_STAGES_STATES];
-
-       /* shaders */
-       struct si_shader_ctx_state      ps_shader;
-       struct si_shader_ctx_state      gs_shader;
-       struct si_shader_ctx_state      vs_shader;
-       struct si_shader_ctx_state      tcs_shader;
-       struct si_shader_ctx_state      tes_shader;
-       struct si_shader_ctx_state      cs_prim_discard_state;
-       struct si_cs_shader_state       cs_shader_state;
-
-       /* shader information */
-       struct si_vertex_elements       *vertex_elements;
-       unsigned                        num_vertex_elements;
-       unsigned                        sprite_coord_enable;
-       unsigned                        cs_max_waves_per_sh;
-       bool                            flatshade;
-       bool                            do_update_shaders;
-
-       /* shader descriptors */
-       struct si_descriptors           descriptors[SI_NUM_DESCS];
-       unsigned                        descriptors_dirty;
-       unsigned                        shader_pointers_dirty;
-       unsigned                        shader_needs_decompress_mask;
-       struct si_buffer_resources      rw_buffers;
-       struct si_buffer_resources      const_and_shader_buffers[SI_NUM_SHADERS];
-       struct si_samplers              samplers[SI_NUM_SHADERS];
-       struct si_images                images[SI_NUM_SHADERS];
-       bool                            bo_list_add_all_resident_resources;
-       bool                            bo_list_add_all_gfx_resources;
-       bool                            bo_list_add_all_compute_resources;
-
-       /* other shader resources */
-       struct pipe_constant_buffer     null_const_buf; /* used for set_constant_buffer(NULL) on GFX7 */
-       struct pipe_resource            *esgs_ring;
-       struct pipe_resource            *gsvs_ring;
-       struct pipe_resource            *tess_rings;
-       union pipe_color_union          *border_color_table; /* in CPU memory, any endian */
-       struct si_resource              *border_color_buffer;
-       union pipe_color_union          *border_color_map; /* in VRAM (slow access), little endian */
-       unsigned                        border_color_count;
-       unsigned                        num_vs_blit_sgprs;
-       uint32_t                        vs_blit_sh_data[SI_VS_BLIT_SGPRS_POS_TEXCOORD];
-       uint32_t                        cs_user_data[4];
-
-       /* Vertex buffers. */
-       bool                            vertex_buffers_dirty;
-       bool                            vertex_buffer_pointer_dirty;
-       bool                            vertex_buffer_user_sgprs_dirty;
-       struct pipe_vertex_buffer       vertex_buffer[SI_NUM_VERTEX_BUFFERS];
-       uint16_t                        vertex_buffer_unaligned; /* bitmask of not dword-aligned buffers */
-       uint32_t                        *vb_descriptors_gpu_list;
-       struct si_resource              *vb_descriptors_buffer;
-       unsigned                        vb_descriptors_offset;
-       unsigned                        vb_descriptor_user_sgprs[5*4];
-
-       /* MSAA config state. */
-       int                             ps_iter_samples;
-       bool                            ps_uses_fbfetch;
-       bool                            smoothing_enabled;
-
-       /* DB render state. */
-       unsigned                ps_db_shader_control;
-       unsigned                dbcb_copy_sample;
-       bool                    dbcb_depth_copy_enabled:1;
-       bool                    dbcb_stencil_copy_enabled:1;
-       bool                    db_flush_depth_inplace:1;
-       bool                    db_flush_stencil_inplace:1;
-       bool                    db_depth_clear:1;
-       bool                    db_depth_disable_expclear:1;
-       bool                    db_stencil_clear:1;
-       bool                    db_stencil_disable_expclear:1;
-       bool                    occlusion_queries_disabled:1;
-       bool                    generate_mipmap_for_depth:1;
-
-       /* Emitted draw state. */
-       bool                    gs_tri_strip_adj_fix:1;
-       bool                    ls_vgpr_fix:1;
-       bool                    prim_discard_cs_instancing:1;
-       bool                    ngg:1;
-       uint8_t                 ngg_culling;
-       int                     last_index_size;
-       int                     last_base_vertex;
-       int                     last_start_instance;
-       int                     last_instance_count;
-       int                     last_drawid;
-       int                     last_sh_base_reg;
-       int                     last_primitive_restart_en;
-       int                     last_restart_index;
-       int                     last_prim;
-       int                     last_multi_vgt_param;
-       int                     last_gs_out_prim;
-       int                     last_binning_enabled;
-       unsigned                current_vs_state;
-       unsigned                last_vs_state;
-       enum pipe_prim_type     current_rast_prim; /* primitive type after TES, GS */
-
-       struct si_small_prim_cull_info last_small_prim_cull_info;
-       struct si_resource      *small_prim_cull_info_buf;
-       uint64_t                small_prim_cull_info_address;
-       bool                    small_prim_cull_info_dirty;
-
-       /* Scratch buffer */
-       struct si_resource      *scratch_buffer;
-       unsigned                scratch_waves;
-       unsigned                spi_tmpring_size;
-       unsigned                max_seen_scratch_bytes_per_wave;
-       unsigned                max_seen_compute_scratch_bytes_per_wave;
-
-       struct si_resource      *compute_scratch_buffer;
-
-       /* Emitted derived tessellation state. */
-       /* Local shader (VS), or HS if LS-HS are merged. */
-       struct si_shader        *last_ls;
-       struct si_shader_selector *last_tcs;
-       int                     last_num_tcs_input_cp;
-       int                     last_tes_sh_base;
-       bool                    last_tess_uses_primid;
-       unsigned                last_num_patches;
-       int                     last_ls_hs_config;
-
-       /* Debug state. */
-       bool                    is_debug;
-       struct si_saved_cs      *current_saved_cs;
-       uint64_t                dmesg_timestamp;
-       unsigned                apitrace_call_number;
-
-       /* Other state */
-       bool need_check_render_feedback;
-       bool                    decompression_enabled;
-       bool                    dpbb_force_off;
-       bool                    vs_writes_viewport_index;
-       bool                    vs_disables_clipping_viewport;
-
-       /* Precomputed IA_MULTI_VGT_PARAM */
-       union si_vgt_param_key  ia_multi_vgt_param_key;
-       unsigned                ia_multi_vgt_param[SI_NUM_VGT_PARAM_STATES];
-
-       /* Bindless descriptors. */
-       struct si_descriptors   bindless_descriptors;
-       struct util_idalloc     bindless_used_slots;
-       unsigned                num_bindless_descriptors;
-       bool                    bindless_descriptors_dirty;
-       bool                    graphics_bindless_pointer_dirty;
-       bool                    compute_bindless_pointer_dirty;
-
-       /* Allocated bindless handles */
-       struct hash_table       *tex_handles;
-       struct hash_table       *img_handles;
-
-       /* Resident bindless handles */
-       struct util_dynarray    resident_tex_handles;
-       struct util_dynarray    resident_img_handles;
-
-       /* Resident bindless handles which need decompression */
-       struct util_dynarray    resident_tex_needs_color_decompress;
-       struct util_dynarray    resident_img_needs_color_decompress;
-       struct util_dynarray    resident_tex_needs_depth_decompress;
-
-       /* Bindless state */
-       bool                    uses_bindless_samplers;
-       bool                    uses_bindless_images;
-
-       /* MSAA sample locations.
-        * The first index is the sample index.
-        * The second index is the coordinate: X, Y. */
-       struct {
-               float                   x1[1][2];
-               float                   x2[2][2];
-               float                   x4[4][2];
-               float                   x8[8][2];
-               float                   x16[16][2];
-       } sample_positions;
-       struct pipe_resource *sample_pos_buffer;
-
-       /* Misc stats. */
-       unsigned                        num_draw_calls;
-       unsigned                        num_decompress_calls;
-       unsigned                        num_mrt_draw_calls;
-       unsigned                        num_prim_restart_calls;
-       unsigned                        num_spill_draw_calls;
-       unsigned                        num_compute_calls;
-       unsigned                        num_spill_compute_calls;
-       unsigned                        num_dma_calls;
-       unsigned                        num_cp_dma_calls;
-       unsigned                        num_vs_flushes;
-       unsigned                        num_ps_flushes;
-       unsigned                        num_cs_flushes;
-       unsigned                        num_cb_cache_flushes;
-       unsigned                        num_db_cache_flushes;
-       unsigned                        num_L2_invalidates;
-       unsigned                        num_L2_writebacks;
-       unsigned                        num_resident_handles;
-       uint64_t                        num_alloc_tex_transfer_bytes;
-       unsigned                        last_tex_ps_draw_ratio; /* for query */
-       unsigned                        compute_num_verts_accepted;
-       unsigned                        compute_num_verts_rejected;
-       unsigned                        compute_num_verts_ineligible; /* due to low vertex count */
-       unsigned                        context_roll;
-
-       /* Queries. */
-       /* Maintain the list of active queries for pausing between IBs. */
-       int                             num_occlusion_queries;
-       int                             num_perfect_occlusion_queries;
-       int                             num_pipeline_stat_queries;
-       struct list_head                active_queries;
-       unsigned                        num_cs_dw_queries_suspend;
-
-       /* Render condition. */
-       struct pipe_query               *render_cond;
-       unsigned                        render_cond_mode;
-       bool                            render_cond_invert;
-       bool                            render_cond_force_off; /* for u_blitter */
-
-       /* For uploading data via GTT and copy to VRAM on context flush via SDMA. */
-       bool                            sdma_uploads_in_progress;
-       struct si_sdma_upload           *sdma_uploads;
-       unsigned                        num_sdma_uploads;
-       unsigned                        max_sdma_uploads;
-
-       /* Shader-based queries. */
-       struct list_head                shader_query_buffers;
-       unsigned                        num_active_shader_queries;
-
-       /* Statistics gathering for the DCC enablement heuristic. It can't be
-        * in si_texture because si_texture can be shared by multiple
-        * contexts. This is for back buffers only. We shouldn't get too many
-        * of those.
-        *
-        * X11 DRI3 rotates among a finite set of back buffers. They should
-        * all fit in this array. If they don't, separate DCC might never be
-        * enabled by DCC stat gathering.
-        */
-       struct {
-               struct si_texture               *tex;
-               /* Query queue: 0 = usually active, 1 = waiting, 2 = readback. */
-               struct pipe_query               *ps_stats[3];
-               /* If all slots are used and another slot is needed,
-                * the least recently used slot is evicted based on this. */
-               int64_t                         last_use_timestamp;
-               bool                            query_active;
-       } dcc_stats[5];
-
-       /* Copy one resource to another using async DMA. */
-       void (*dma_copy)(struct pipe_context *ctx,
-                        struct pipe_resource *dst,
-                        unsigned dst_level,
-                        unsigned dst_x, unsigned dst_y, unsigned dst_z,
-                        struct pipe_resource *src,
-                        unsigned src_level,
-                        const struct pipe_box *src_box);
-
-       struct si_tracked_regs                  tracked_regs;
+   struct pipe_context b; /* base class */
+
+   enum radeon_family family;
+   enum chip_class chip_class;
+
+   struct radeon_winsys *ws;
+   struct radeon_winsys_ctx *ctx;
+   struct radeon_cmdbuf *gfx_cs; /* compute IB if graphics is disabled */
+   struct radeon_cmdbuf *sdma_cs;
+   struct pipe_fence_handle *last_gfx_fence;
+   struct pipe_fence_handle *last_sdma_fence;
+   struct si_resource *eop_bug_scratch;
+   struct u_upload_mgr *cached_gtt_allocator;
+   struct threaded_context *tc;
+   struct u_suballocator *allocator_zeroed_memory;
+   struct slab_child_pool pool_transfers;
+   struct slab_child_pool pool_transfers_unsync; /* for threaded_context */
+   struct pipe_device_reset_callback device_reset_callback;
+   struct u_log_context *log;
+   void *query_result_shader;
+   void *sh_query_result_shader;
+
+   void (*emit_cache_flush)(struct si_context *ctx);
+
+   struct blitter_context *blitter;
+   void *noop_blend;
+   void *noop_dsa;
+   void *discard_rasterizer_state;
+   void *custom_dsa_flush;
+   void *custom_blend_resolve;
+   void *custom_blend_fmask_decompress;
+   void *custom_blend_eliminate_fastclear;
+   void *custom_blend_dcc_decompress;
+   void *vs_blit_pos;
+   void *vs_blit_pos_layered;
+   void *vs_blit_color;
+   void *vs_blit_color_layered;
+   void *vs_blit_texcoord;
+   void *cs_clear_buffer;
+   void *cs_copy_buffer;
+   void *cs_copy_image;
+   void *cs_copy_image_1d_array;
+   void *cs_clear_render_target;
+   void *cs_clear_render_target_1d_array;
+   void *cs_clear_12bytes_buffer;
+   void *cs_dcc_retile;
+   void *cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */
+   struct si_screen *screen;
+   struct pipe_debug_callback debug;
+   struct ac_llvm_compiler compiler; /* only non-threaded compilation */
+   struct si_shader_ctx_state fixed_func_tcs_shader;
+   /* Offset 0: EOP flush number; Offset 4: GDS prim restart counter */
+   struct si_resource *wait_mem_scratch;
+   unsigned wait_mem_number;
+   uint16_t prefetch_L2_mask;
+
+   bool has_graphics;
+   bool gfx_flush_in_progress : 1;
+   bool gfx_last_ib_is_busy : 1;
+   bool compute_is_busy : 1;
+
+   unsigned num_gfx_cs_flushes;
+   unsigned initial_gfx_cs_size;
+   unsigned last_dirty_tex_counter;
+   unsigned last_dirty_buf_counter;
+   unsigned last_compressed_colortex_counter;
+   unsigned last_num_draw_calls;
+   unsigned flags; /* flush flags */
+   /* Current unaccounted memory usage. */
+   uint64_t vram;
+   uint64_t gtt;
+
+   /* Compute-based primitive discard. */
+   unsigned prim_discard_vertex_count_threshold;
+   struct pb_buffer *gds;
+   struct pb_buffer *gds_oa;
+   struct radeon_cmdbuf *prim_discard_compute_cs;
+   unsigned compute_gds_offset;
+   struct si_shader *compute_ib_last_shader;
+   uint32_t compute_rewind_va;
+   unsigned compute_num_prims_in_batch;
+   bool preserve_prim_restart_gds_at_flush;
+   /* index_ring is divided into 2 halves for doublebuffering. */
+   struct si_resource *index_ring;
+   unsigned index_ring_base;        /* offset of a per-IB portion */
+   unsigned index_ring_offset;      /* offset within a per-IB portion */
+   unsigned index_ring_size_per_ib; /* max available size per IB */
+   bool prim_discard_compute_ib_initialized;
+   /* For tracking the last execution barrier - it can be either
+    * a WRITE_DATA packet or a fence. */
+   uint32_t *last_pkt3_write_data;
+   struct si_resource *barrier_buf;
+   unsigned barrier_buf_offset;
+   struct pipe_fence_handle *last_ib_barrier_fence;
+   struct si_resource *last_ib_barrier_buf;
+   unsigned last_ib_barrier_buf_offset;
+
+   /* Atoms (direct states). */
+   union si_state_atoms atoms;
+   unsigned dirty_atoms; /* mask */
+   /* PM4 states (precomputed immutable states) */
+   unsigned dirty_states;
+   union si_state queued;
+   union si_state emitted;
+
+   /* Atom declarations. */
+   struct si_framebuffer framebuffer;
+   unsigned sample_locs_num_samples;
+   uint16_t sample_mask;
+   unsigned last_cb_target_mask;
+   struct si_blend_color blend_color;
+   struct si_clip_state clip_state;
+   struct si_shader_data shader_pointers;
+   struct si_stencil_ref stencil_ref;
+   struct pipe_scissor_state scissors[SI_MAX_VIEWPORTS];
+   struct si_streamout streamout;
+   struct si_viewports viewports;
+   unsigned num_window_rectangles;
+   bool window_rectangles_include;
+   struct pipe_scissor_state window_rectangles[4];
+
+   /* Precomputed states. */
+   struct si_pm4_state *init_config;
+   struct si_pm4_state *init_config_gs_rings;
+   bool init_config_has_vgt_flush;
+   struct si_pm4_state *vgt_shader_config[SI_NUM_VGT_STAGES_STATES];
+
+   /* shaders */
+   struct si_shader_ctx_state ps_shader;
+   struct si_shader_ctx_state gs_shader;
+   struct si_shader_ctx_state vs_shader;
+   struct si_shader_ctx_state tcs_shader;
+   struct si_shader_ctx_state tes_shader;
+   struct si_shader_ctx_state cs_prim_discard_state;
+   struct si_cs_shader_state cs_shader_state;
+
+   /* shader information */
+   struct si_vertex_elements *vertex_elements;
+   unsigned num_vertex_elements;
+   unsigned sprite_coord_enable;
+   unsigned cs_max_waves_per_sh;
+   bool flatshade;
+   bool do_update_shaders;
+
+   /* shader descriptors */
+   struct si_descriptors descriptors[SI_NUM_DESCS];
+   unsigned descriptors_dirty;
+   unsigned shader_pointers_dirty;
+   unsigned shader_needs_decompress_mask;
+   struct si_buffer_resources rw_buffers;
+   struct si_buffer_resources const_and_shader_buffers[SI_NUM_SHADERS];
+   struct si_samplers samplers[SI_NUM_SHADERS];
+   struct si_images images[SI_NUM_SHADERS];
+   bool bo_list_add_all_resident_resources;
+   bool bo_list_add_all_gfx_resources;
+   bool bo_list_add_all_compute_resources;
+
+   /* other shader resources */
+   struct pipe_constant_buffer null_const_buf; /* used for set_constant_buffer(NULL) on GFX7 */
+   struct pipe_resource *esgs_ring;
+   struct pipe_resource *gsvs_ring;
+   struct pipe_resource *tess_rings;
+   union pipe_color_union *border_color_table; /* in CPU memory, any endian */
+   struct si_resource *border_color_buffer;
+   union pipe_color_union *border_color_map; /* in VRAM (slow access), little endian */
+   unsigned border_color_count;
+   unsigned num_vs_blit_sgprs;
+   uint32_t vs_blit_sh_data[SI_VS_BLIT_SGPRS_POS_TEXCOORD];
+   uint32_t cs_user_data[4];
+
+   /* Vertex buffers. */
+   bool vertex_buffers_dirty;
+   bool vertex_buffer_pointer_dirty;
+   bool vertex_buffer_user_sgprs_dirty;
+   struct pipe_vertex_buffer vertex_buffer[SI_NUM_VERTEX_BUFFERS];
+   uint16_t vertex_buffer_unaligned; /* bitmask of not dword-aligned buffers */
+   uint32_t *vb_descriptors_gpu_list;
+   struct si_resource *vb_descriptors_buffer;
+   unsigned vb_descriptors_offset;
+   unsigned vb_descriptor_user_sgprs[5 * 4];
+
+   /* MSAA config state. */
+   int ps_iter_samples;
+   bool ps_uses_fbfetch;
+   bool smoothing_enabled;
+
+   /* DB render state. */
+   unsigned ps_db_shader_control;
+   unsigned dbcb_copy_sample;
+   bool dbcb_depth_copy_enabled : 1;
+   bool dbcb_stencil_copy_enabled : 1;
+   bool db_flush_depth_inplace : 1;
+   bool db_flush_stencil_inplace : 1;
+   bool db_depth_clear : 1;
+   bool db_depth_disable_expclear : 1;
+   bool db_stencil_clear : 1;
+   bool db_stencil_disable_expclear : 1;
+   bool occlusion_queries_disabled : 1;
+   bool generate_mipmap_for_depth : 1;
+
+   /* Emitted draw state. */
+   bool gs_tri_strip_adj_fix : 1;
+   bool ls_vgpr_fix : 1;
+   bool prim_discard_cs_instancing : 1;
+   bool ngg : 1;
+   uint8_t ngg_culling;
+   int last_index_size;
+   int last_base_vertex;
+   int last_start_instance;
+   int last_instance_count;
+   int last_drawid;
+   int last_sh_base_reg;
+   int last_primitive_restart_en;
+   int last_restart_index;
+   int last_prim;
+   int last_multi_vgt_param;
+   int last_gs_out_prim;
+   int last_binning_enabled;
+   unsigned current_vs_state;
+   unsigned last_vs_state;
+   enum pipe_prim_type current_rast_prim; /* primitive type after TES, GS */
+
+   struct si_small_prim_cull_info last_small_prim_cull_info;
+   struct si_resource *small_prim_cull_info_buf;
+   uint64_t small_prim_cull_info_address;
+   bool small_prim_cull_info_dirty;
+
+   /* Scratch buffer */
+   struct si_resource *scratch_buffer;
+   unsigned scratch_waves;
+   unsigned spi_tmpring_size;
+   unsigned max_seen_scratch_bytes_per_wave;
+   unsigned max_seen_compute_scratch_bytes_per_wave;
+
+   struct si_resource *compute_scratch_buffer;
+
+   /* Emitted derived tessellation state. */
+   /* Local shader (VS), or HS if LS-HS are merged. */
+   struct si_shader *last_ls;
+   struct si_shader_selector *last_tcs;
+   int last_num_tcs_input_cp;
+   int last_tes_sh_base;
+   bool last_tess_uses_primid;
+   unsigned last_num_patches;
+   int last_ls_hs_config;
+
+   /* Debug state. */
+   bool is_debug;
+   struct si_saved_cs *current_saved_cs;
+   uint64_t dmesg_timestamp;
+   unsigned apitrace_call_number;
+
+   /* Other state */
+   bool need_check_render_feedback;
+   bool decompression_enabled;
+   bool dpbb_force_off;
+   bool vs_writes_viewport_index;
+   bool vs_disables_clipping_viewport;
+
+   /* Precomputed IA_MULTI_VGT_PARAM */
+   union si_vgt_param_key ia_multi_vgt_param_key;
+   unsigned ia_multi_vgt_param[SI_NUM_VGT_PARAM_STATES];
+
+   /* Bindless descriptors. */
+   struct si_descriptors bindless_descriptors;
+   struct util_idalloc bindless_used_slots;
+   unsigned num_bindless_descriptors;
+   bool bindless_descriptors_dirty;
+   bool graphics_bindless_pointer_dirty;
+   bool compute_bindless_pointer_dirty;
+
+   /* Allocated bindless handles */
+   struct hash_table *tex_handles;
+   struct hash_table *img_handles;
+
+   /* Resident bindless handles */
+   struct util_dynarray resident_tex_handles;
+   struct util_dynarray resident_img_handles;
+
+   /* Resident bindless handles which need decompression */
+   struct util_dynarray resident_tex_needs_color_decompress;
+   struct util_dynarray resident_img_needs_color_decompress;
+   struct util_dynarray resident_tex_needs_depth_decompress;
+
+   /* Bindless state */
+   bool uses_bindless_samplers;
+   bool uses_bindless_images;
+
+   /* MSAA sample locations.
+    * The first index is the sample index.
+    * The second index is the coordinate: X, Y. */
+   struct {
+      float x1[1][2];
+      float x2[2][2];
+      float x4[4][2];
+      float x8[8][2];
+      float x16[16][2];
+   } sample_positions;
+   struct pipe_resource *sample_pos_buffer;
+
+   /* Misc stats. */
+   unsigned num_draw_calls;
+   unsigned num_decompress_calls;
+   unsigned num_mrt_draw_calls;
+   unsigned num_prim_restart_calls;
+   unsigned num_spill_draw_calls;
+   unsigned num_compute_calls;
+   unsigned num_spill_compute_calls;
+   unsigned num_dma_calls;
+   unsigned num_cp_dma_calls;
+   unsigned num_vs_flushes;
+   unsigned num_ps_flushes;
+   unsigned num_cs_flushes;
+   unsigned num_cb_cache_flushes;
+   unsigned num_db_cache_flushes;
+   unsigned num_L2_invalidates;
+   unsigned num_L2_writebacks;
+   unsigned num_resident_handles;
+   uint64_t num_alloc_tex_transfer_bytes;
+   unsigned last_tex_ps_draw_ratio; /* for query */
+   unsigned compute_num_verts_accepted;
+   unsigned compute_num_verts_rejected;
+   unsigned compute_num_verts_ineligible; /* due to low vertex count */
+   unsigned context_roll;
+
+   /* Queries. */
+   /* Maintain the list of active queries for pausing between IBs. */
+   int num_occlusion_queries;
+   int num_perfect_occlusion_queries;
+   int num_pipeline_stat_queries;
+   struct list_head active_queries;
+   unsigned num_cs_dw_queries_suspend;
+
+   /* Render condition. */
+   struct pipe_query *render_cond;
+   unsigned render_cond_mode;
+   bool render_cond_invert;
+   bool render_cond_force_off; /* for u_blitter */
+
+   /* For uploading data via GTT and copy to VRAM on context flush via SDMA. */
+   bool sdma_uploads_in_progress;
+   struct si_sdma_upload *sdma_uploads;
+   unsigned num_sdma_uploads;
+   unsigned max_sdma_uploads;
+
+   /* Shader-based queries. */
+   struct list_head shader_query_buffers;
+   unsigned num_active_shader_queries;
+
+   /* Statistics gathering for the DCC enablement heuristic. It can't be
+    * in si_texture because si_texture can be shared by multiple
+    * contexts. This is for back buffers only. We shouldn't get too many
+    * of those.
+    *
+    * X11 DRI3 rotates among a finite set of back buffers. They should
+    * all fit in this array. If they don't, separate DCC might never be
+    * enabled by DCC stat gathering.
+    */
+   struct {
+      struct si_texture *tex;
+      /* Query queue: 0 = usually active, 1 = waiting, 2 = readback. */
+      struct pipe_query *ps_stats[3];
+      /* If all slots are used and another slot is needed,
+       * the least recently used slot is evicted based on this. */
+      int64_t last_use_timestamp;
+      bool query_active;
+   } dcc_stats[5];
+
+   /* Copy one resource to another using async DMA. */
+   void (*dma_copy)(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dst_level,
+                    unsigned dst_x, unsigned dst_y, unsigned dst_z, struct pipe_resource *src,
+                    unsigned src_level, const struct pipe_box *src_box);
+
+   struct si_tracked_regs tracked_regs;
 };
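
A minimal sketch of the least-recently-used eviction rule described in the dcc_stats comment inside
si_context above; this illustrates the documented behaviour and is not the driver's actual function:

   static unsigned least_recently_used_dcc_slot(struct si_context *sctx)
   {
      unsigned oldest = 0;

      for (unsigned i = 1; i < ARRAY_SIZE(sctx->dcc_stats); i++) {
         if (sctx->dcc_stats[i].last_use_timestamp < sctx->dcc_stats[oldest].last_use_timestamp)
            oldest = i;
      }
      return oldest;
   }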
 
 /* cik_sdma.c */
@@ -1269,124 +1263,99 @@ void cik_init_sdma_functions(struct si_context *sctx);
 /* si_blit.c */
 enum si_blitter_op /* bitmask */
 {
-       SI_SAVE_TEXTURES      = 1,
-       SI_SAVE_FRAMEBUFFER   = 2,
-       SI_SAVE_FRAGMENT_STATE = 4,
-       SI_DISABLE_RENDER_COND = 8,
+   SI_SAVE_TEXTURES = 1,
+   SI_SAVE_FRAMEBUFFER = 2,
+   SI_SAVE_FRAGMENT_STATE = 4,
+   SI_DISABLE_RENDER_COND = 8,
 };
 
 void si_blitter_begin(struct si_context *sctx, enum si_blitter_op op);
 void si_blitter_end(struct si_context *sctx);
 void si_init_blit_functions(struct si_context *sctx);
 void si_decompress_textures(struct si_context *sctx, unsigned shader_mask);
-void si_decompress_subresource(struct pipe_context *ctx,
-                              struct pipe_resource *tex,
-                              unsigned planes, unsigned level,
-                              unsigned first_layer, unsigned last_layer);
-void si_resource_copy_region(struct pipe_context *ctx,
-                            struct pipe_resource *dst,
-                            unsigned dst_level,
-                            unsigned dstx, unsigned dsty, unsigned dstz,
-                            struct pipe_resource *src,
-                            unsigned src_level,
-                            const struct pipe_box *src_box);
+void si_decompress_subresource(struct pipe_context *ctx, struct pipe_resource *tex, unsigned planes,
+                               unsigned level, unsigned first_layer, unsigned last_layer);
+void si_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst,
+                             unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz,
+                             struct pipe_resource *src, unsigned src_level,
+                             const struct pipe_box *src_box);
 void si_decompress_dcc(struct si_context *sctx, struct si_texture *tex);
 
 /* si_buffer.c */
-bool si_rings_is_buffer_referenced(struct si_context *sctx,
-                                  struct pb_buffer *buf,
-                                  enum radeon_bo_usage usage);
-void *si_buffer_map_sync_with_rings(struct si_context *sctx,
-                                   struct si_resource *resource,
-                                   unsigned usage);
-void si_init_resource_fields(struct si_screen *sscreen,
-                            struct si_resource *res,
-                            uint64_t size, unsigned alignment);
-bool si_alloc_resource(struct si_screen *sscreen,
-                      struct si_resource *res);
-struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen,
-                                                unsigned flags, unsigned usage,
-                                                unsigned size, unsigned alignment);
-struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen,
-                                              unsigned flags, unsigned usage,
-                                              unsigned size, unsigned alignment);
-void si_replace_buffer_storage(struct pipe_context *ctx,
-                              struct pipe_resource *dst,
-                              struct pipe_resource *src);
+bool si_rings_is_buffer_referenced(struct si_context *sctx, struct pb_buffer *buf,
+                                   enum radeon_bo_usage usage);
+void *si_buffer_map_sync_with_rings(struct si_context *sctx, struct si_resource *resource,
+                                    unsigned usage);
+void si_init_resource_fields(struct si_screen *sscreen, struct si_resource *res, uint64_t size,
+                             unsigned alignment);
+bool si_alloc_resource(struct si_screen *sscreen, struct si_resource *res);
+struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen, unsigned flags,
+                                                 unsigned usage, unsigned size, unsigned alignment);
+struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen, unsigned flags,
+                                             unsigned usage, unsigned size, unsigned alignment);
+void si_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *dst,
+                               struct pipe_resource *src);
 void si_init_screen_buffer_functions(struct si_screen *sscreen);
 void si_init_buffer_functions(struct si_context *sctx);
 
 /* si_clear.c */
 enum pipe_format si_simplify_cb_format(enum pipe_format format);
 bool vi_alpha_is_on_msb(struct si_screen *sscreen, enum pipe_format format);
-bool vi_dcc_clear_level(struct si_context *sctx,
-                       struct si_texture *tex,
-                       unsigned level, unsigned clear_value);
+bool vi_dcc_clear_level(struct si_context *sctx, struct si_texture *tex, unsigned level,
+                        unsigned clear_value);
 void si_init_clear_functions(struct si_context *sctx);
 
 /* si_compute_blit.c */
 unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
-                           enum si_cache_policy cache_policy);
-void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
-                    uint64_t offset, uint64_t size, uint32_t *clear_value,
-                    uint32_t clear_value_size, enum si_coherency coher,
-                    bool force_cpdma);
-void si_copy_buffer(struct si_context *sctx,
-                   struct pipe_resource *dst, struct pipe_resource *src,
-                   uint64_t dst_offset, uint64_t src_offset, unsigned size);
-void si_compute_copy_image(struct si_context *sctx,
-                          struct pipe_resource *dst,
-                          unsigned dst_level,
-                          struct pipe_resource *src,
-                          unsigned src_level,
-                          unsigned dstx, unsigned dsty, unsigned dstz,
-                          const struct pipe_box *src_box);
-void si_compute_clear_render_target(struct pipe_context *ctx,
-                                    struct pipe_surface *dstsurf,
-                                    const union pipe_color_union *color,
-                                    unsigned dstx, unsigned dsty,
-                                    unsigned width, unsigned height,
-                                   bool render_condition_enabled);
+                            enum si_cache_policy cache_policy);
+void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset,
+                     uint64_t size, uint32_t *clear_value, uint32_t clear_value_size,
+                     enum si_coherency coher, bool force_cpdma);
+void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
+                    uint64_t dst_offset, uint64_t src_offset, unsigned size);
+void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level,
+                           struct pipe_resource *src, unsigned src_level, unsigned dstx,
+                           unsigned dsty, unsigned dstz, const struct pipe_box *src_box);
+void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dstsurf,
+                                    const union pipe_color_union *color, unsigned dstx,
+                                    unsigned dsty, unsigned width, unsigned height,
+                                    bool render_condition_enabled);
 void si_retile_dcc(struct si_context *sctx, struct si_texture *tex);
 void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex);
 void si_init_compute_blit_functions(struct si_context *sctx);
 
 /* si_cp_dma.c */
-#define SI_CPDMA_SKIP_CHECK_CS_SPACE   (1 << 0) /* don't call need_cs_space */
-#define SI_CPDMA_SKIP_SYNC_AFTER       (1 << 1) /* don't wait for DMA after the copy */
-#define SI_CPDMA_SKIP_SYNC_BEFORE      (1 << 2) /* don't wait for DMA before the copy (RAW hazards) */
-#define SI_CPDMA_SKIP_GFX_SYNC         (1 << 3) /* don't flush caches and don't wait for PS/CS */
-#define SI_CPDMA_SKIP_BO_LIST_UPDATE   (1 << 4) /* don't update the BO list */
-#define SI_CPDMA_SKIP_ALL (SI_CPDMA_SKIP_CHECK_CS_SPACE | \
-                          SI_CPDMA_SKIP_SYNC_AFTER | \
-                          SI_CPDMA_SKIP_SYNC_BEFORE | \
-                          SI_CPDMA_SKIP_GFX_SYNC | \
-                          SI_CPDMA_SKIP_BO_LIST_UPDATE)
+#define SI_CPDMA_SKIP_CHECK_CS_SPACE (1 << 0) /* don't call need_cs_space */
+#define SI_CPDMA_SKIP_SYNC_AFTER     (1 << 1) /* don't wait for DMA after the copy */
+#define SI_CPDMA_SKIP_SYNC_BEFORE    (1 << 2) /* don't wait for DMA before the copy (RAW hazards) */
+#define SI_CPDMA_SKIP_GFX_SYNC       (1 << 3) /* don't flush caches and don't wait for PS/CS */
+#define SI_CPDMA_SKIP_BO_LIST_UPDATE (1 << 4) /* don't update the BO list */
+#define SI_CPDMA_SKIP_ALL                                                                          \
+   (SI_CPDMA_SKIP_CHECK_CS_SPACE | SI_CPDMA_SKIP_SYNC_AFTER | SI_CPDMA_SKIP_SYNC_BEFORE |          \
+    SI_CPDMA_SKIP_GFX_SYNC | SI_CPDMA_SKIP_BO_LIST_UPDATE)
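
A minimal sketch of how these flags might be combined, per the per-flag comments: a hypothetical
caller emitting several CP DMA packets back to back could skip the per-packet synchronization and
CS-space check for the inner packets and synchronize once around the whole batch:

   unsigned user_flags = SI_CPDMA_SKIP_CHECK_CS_SPACE | SI_CPDMA_SKIP_SYNC_BEFORE |
                         SI_CPDMA_SKIP_SYNC_AFTER;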
 
 void si_cp_dma_wait_for_idle(struct si_context *sctx);
 void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
-                           struct pipe_resource *dst, uint64_t offset,
-                           uint64_t size, unsigned value, unsigned user_flags,
-                           enum si_coherency coher, enum si_cache_policy cache_policy);
-void si_cp_dma_copy_buffer(struct si_context *sctx,
-                          struct pipe_resource *dst, struct pipe_resource *src,
-                          uint64_t dst_offset, uint64_t src_offset, unsigned size,
-                          unsigned user_flags, enum si_coherency coher,
-                          enum si_cache_policy cache_policy);
-void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
-                             uint64_t offset, unsigned size);
+                            struct pipe_resource *dst, uint64_t offset, uint64_t size,
+                            unsigned value, unsigned user_flags, enum si_coherency coher,
+                            enum si_cache_policy cache_policy);
+void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
+                           struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset,
+                           unsigned size, unsigned user_flags, enum si_coherency coher,
+                           enum si_cache_policy cache_policy);
+void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, uint64_t offset,
+                              unsigned size);
 void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only);
 void si_test_gds(struct si_context *sctx);
-void si_cp_write_data(struct si_context *sctx, struct si_resource *buf,
-                     unsigned offset, unsigned size, unsigned dst_sel,
-                     unsigned engine, const void *data);
-void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs,
-                    unsigned dst_sel, struct si_resource *dst, unsigned dst_offset,
-                    unsigned src_sel, struct si_resource *src, unsigned src_offset);
+void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, unsigned offset,
+                      unsigned size, unsigned dst_sel, unsigned engine, const void *data);
+void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned dst_sel,
+                     struct si_resource *dst, unsigned dst_offset, unsigned src_sel,
+                     struct si_resource *src, unsigned src_offset);
 
 /* si_debug.c */
-void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs,
-               struct radeon_saved_cs *saved, bool get_buffer_list);
+void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs, struct radeon_saved_cs *saved,
+                bool get_buffer_list);
 void si_clear_saved_cs(struct radeon_saved_cs *saved);
 void si_destroy_saved_cs(struct si_saved_cs *scs);
 void si_auto_log_cs(void *data, struct u_log_context *log);
@@ -1394,45 +1363,41 @@ void si_log_hw_flush(struct si_context *sctx);
 void si_log_draw_state(struct si_context *sctx, struct u_log_context *log);
 void si_log_compute_state(struct si_context *sctx, struct u_log_context *log);
 void si_init_debug_functions(struct si_context *sctx);
-void si_check_vm_faults(struct si_context *sctx,
-                       struct radeon_saved_cs *saved, enum ring_type ring);
+void si_check_vm_faults(struct si_context *sctx, struct radeon_saved_cs *saved,
+                        enum ring_type ring);
 bool si_replace_shader(unsigned num, struct si_shader_binary *binary);
 
 /* si_dma_cs.c */
-void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst,
-                          uint64_t offset);
-void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
-                         uint64_t offset, uint64_t size, unsigned clear_value);
+void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, uint64_t offset);
+void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset,
+                          uint64_t size, unsigned clear_value);
 void si_sdma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
-                        struct pipe_resource *src, uint64_t dst_offset,
-                        uint64_t src_offset, uint64_t size);
-void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
-                      struct si_resource *dst, struct si_resource *src);
-void si_flush_dma_cs(struct si_context *ctx, unsigned flags,
-                    struct pipe_fence_handle **fence);
-void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst,
-                           uint64_t offset, uint64_t size, unsigned value);
+                         struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset,
+                         uint64_t size);
+void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct si_resource *dst,
+                       struct si_resource *src);
+void si_flush_dma_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence);
+void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset,
+                            uint64_t size, unsigned value);
 
 /* si_fence.c */
-void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs,
-                      unsigned event, unsigned event_flags,
-                      unsigned dst_sel, unsigned int_sel, unsigned data_sel,
-                      struct si_resource *buf, uint64_t va,
-                      uint32_t new_fence, unsigned query_type);
+void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigned event,
+                       unsigned event_flags, unsigned dst_sel, unsigned int_sel, unsigned data_sel,
+                       struct si_resource *buf, uint64_t va, uint32_t new_fence,
+                       unsigned query_type);
 unsigned si_cp_write_fence_dwords(struct si_screen *screen);
-void si_cp_wait_mem(struct si_context *ctx, struct radeon_cmdbuf *cs,
-                     uint64_t va, uint32_t ref, uint32_t mask, unsigned flags);
+void si_cp_wait_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, uint64_t va, uint32_t ref,
+                    uint32_t mask, unsigned flags);
 void si_init_fence_functions(struct si_context *ctx);
 void si_init_screen_fence_functions(struct si_screen *screen);
 struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx,
-                                         struct tc_unflushed_batch_token *tc_token);
+                                          struct tc_unflushed_batch_token *tc_token);
 
 /* si_get.c */
 void si_init_screen_get_functions(struct si_screen *sscreen);
 
 /* si_gfx_cs.c */
-void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
-                    struct pipe_fence_handle **fence);
+void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence);
 void si_allocate_gds(struct si_context *ctx);
 void si_begin_new_gfx_cs(struct si_context *ctx);
 void si_need_gfx_cs_space(struct si_context *ctx);
@@ -1441,36 +1406,32 @@ void si_unref_sdma_uploads(struct si_context *sctx);
 /* si_gpu_load.c */
 void si_gpu_load_kill_thread(struct si_screen *sscreen);
 uint64_t si_begin_counter(struct si_screen *sscreen, unsigned type);
-unsigned si_end_counter(struct si_screen *sscreen, unsigned type,
-                       uint64_t begin);
+unsigned si_end_counter(struct si_screen *sscreen, unsigned type, uint64_t begin);
 
 /* si_compute.c */
 void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs);
 void si_init_compute_functions(struct si_context *sctx);
 
 /* si_compute_prim_discard.c */
-enum si_prim_discard_outcome {
-       SI_PRIM_DISCARD_ENABLED,
-       SI_PRIM_DISCARD_DISABLED,
-       SI_PRIM_DISCARD_DRAW_SPLIT,
+enum si_prim_discard_outcome
+{
+   SI_PRIM_DISCARD_ENABLED,
+   SI_PRIM_DISCARD_DISABLED,
+   SI_PRIM_DISCARD_DRAW_SPLIT,
 };
 
 void si_build_prim_discard_compute_shader(struct si_shader_context *ctx);
 enum si_prim_discard_outcome
-si_prepare_prim_discard_or_split_draw(struct si_context *sctx,
-                                     const struct pipe_draw_info *info,
-                                     bool primitive_restart);
+si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info,
+                                      bool primitive_restart);
 void si_compute_signal_gfx(struct si_context *sctx);
 void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
-                                         const struct pipe_draw_info *info,
-                                         unsigned index_size,
-                                         unsigned base_vertex,
-                                         uint64_t input_indexbuf_va,
-                                         unsigned input_indexbuf_max_elements);
-void si_initialize_prim_discard_tunables(struct si_screen *sscreen,
-                                        bool is_aux_context,
-                                        unsigned *prim_discard_vertex_count_threshold,
-                                        unsigned *index_ring_size_per_ib);
+                                          const struct pipe_draw_info *info, unsigned index_size,
+                                          unsigned base_vertex, uint64_t input_indexbuf_va,
+                                          unsigned input_indexbuf_max_elements);
+void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context,
+                                         unsigned *prim_discard_vertex_count_threshold,
+                                         unsigned *index_ring_size_per_ib);
 
 /* si_pipe.c */
 void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compiler);
@@ -1487,19 +1448,17 @@ void si_resume_queries(struct si_context *sctx);
 
 /* si_shaderlib_tgsi.c */
 void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
-                       unsigned num_layers);
+                        unsigned num_layers);
 void *si_create_fixed_func_tcs(struct si_context *sctx);
-void *si_create_dma_compute_shader(struct pipe_context *ctx,
-                                  unsigned num_dwords_per_thread,
-                                  bool dst_stream_cache_policy, bool is_copy);
+void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread,
+                                   bool dst_stream_cache_policy, bool is_copy);
 void *si_create_copy_image_compute_shader(struct pipe_context *ctx);
 void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx);
 void *si_clear_render_target_shader(struct pipe_context *ctx);
 void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx);
 void *si_clear_12bytes_buffer_shader(struct pipe_context *ctx);
 void *si_create_dcc_retile_cs(struct pipe_context *ctx);
-void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples,
-                               bool is_array);
+void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples, bool is_array);
 void *si_create_query_result_cs(struct si_context *sctx);
 void *gfx10_create_sh_query_result_cs(struct si_context *sctx);
 
@@ -1515,370 +1474,317 @@ void si_test_dma_perf(struct si_screen *sscreen);
 
 /* si_uvd.c */
 struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context,
-                                              const struct pipe_video_codec *templ);
+                                               const struct pipe_video_codec *templ);
 
 struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe,
-                                                const struct pipe_video_buffer *tmpl);
+                                                 const struct pipe_video_buffer *tmpl);
 
 /* si_viewport.c */
 void si_update_ngg_small_prim_precision(struct si_context *ctx);
-void si_get_small_prim_cull_info(struct si_context *sctx,
-                                struct si_small_prim_cull_info *out);
+void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out);
 void si_update_vs_viewport_state(struct si_context *ctx);
 void si_init_viewport_functions(struct si_context *ctx);
 
 /* si_texture.c */
-bool si_prepare_for_dma_blit(struct si_context *sctx,
-                            struct si_texture *dst,
-                            unsigned dst_level, unsigned dstx,
-                            unsigned dsty, unsigned dstz,
-                            struct si_texture *src,
-                            unsigned src_level,
-                            const struct pipe_box *src_box);
-void si_eliminate_fast_color_clear(struct si_context *sctx,
-                                  struct si_texture *tex);
-void si_texture_discard_cmask(struct si_screen *sscreen,
-                             struct si_texture *tex);
-bool si_init_flushed_depth_texture(struct pipe_context *ctx,
-                                  struct pipe_resource *texture);
-void si_print_texture_info(struct si_screen *sscreen,
-                          struct si_texture *tex, struct u_log_context *log);
+bool si_prepare_for_dma_blit(struct si_context *sctx, struct si_texture *dst, unsigned dst_level,
+                             unsigned dstx, unsigned dsty, unsigned dstz, struct si_texture *src,
+                             unsigned src_level, const struct pipe_box *src_box);
+void si_eliminate_fast_color_clear(struct si_context *sctx, struct si_texture *tex);
+void si_texture_discard_cmask(struct si_screen *sscreen, struct si_texture *tex);
+bool si_init_flushed_depth_texture(struct pipe_context *ctx, struct pipe_resource *texture);
+void si_print_texture_info(struct si_screen *sscreen, struct si_texture *tex,
+                           struct u_log_context *log);
 struct pipe_resource *si_texture_create(struct pipe_screen *screen,
-                                       const struct pipe_resource *templ);
-bool vi_dcc_formats_compatible(struct si_screen *sscreen,
-                              enum pipe_format format1,
-                              enum pipe_format format2);
-bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex,
-                                    unsigned level,
-                                    enum pipe_format view_format);
-void vi_disable_dcc_if_incompatible_format(struct si_context *sctx,
-                                          struct pipe_resource *tex,
-                                          unsigned level,
-                                          enum pipe_format view_format);
+                                        const struct pipe_resource *templ);
+bool vi_dcc_formats_compatible(struct si_screen *sscreen, enum pipe_format format1,
+                               enum pipe_format format2);
+bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex, unsigned level,
+                                     enum pipe_format view_format);
+void vi_disable_dcc_if_incompatible_format(struct si_context *sctx, struct pipe_resource *tex,
+                                           unsigned level, enum pipe_format view_format);
 struct pipe_surface *si_create_surface_custom(struct pipe_context *pipe,
-                                             struct pipe_resource *texture,
-                                             const struct pipe_surface *templ,
-                                             unsigned width0, unsigned height0,
-                                             unsigned width, unsigned height);
+                                              struct pipe_resource *texture,
+                                              const struct pipe_surface *templ, unsigned width0,
+                                              unsigned height0, unsigned width, unsigned height);
 unsigned si_translate_colorswap(enum pipe_format format, bool do_endian_swap);
-void vi_separate_dcc_try_enable(struct si_context *sctx,
-                               struct si_texture *tex);
-void vi_separate_dcc_start_query(struct si_context *sctx,
-                                struct si_texture *tex);
-void vi_separate_dcc_stop_query(struct si_context *sctx,
-                               struct si_texture *tex);
-void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx,
-                                            struct si_texture *tex);
-bool si_texture_disable_dcc(struct si_context *sctx,
-                           struct si_texture *tex);
+void vi_separate_dcc_try_enable(struct si_context *sctx, struct si_texture *tex);
+void vi_separate_dcc_start_query(struct si_context *sctx, struct si_texture *tex);
+void vi_separate_dcc_stop_query(struct si_context *sctx, struct si_texture *tex);
+void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx, struct si_texture *tex);
+bool si_texture_disable_dcc(struct si_context *sctx, struct si_texture *tex);
 void si_init_screen_texture_functions(struct si_screen *sscreen);
 void si_init_context_texture_functions(struct si_context *sctx);
 
-
 /*
  * common helpers
  */
 
 static inline struct si_resource *si_resource(struct pipe_resource *r)
 {
-       return (struct si_resource*)r;
+   return (struct si_resource *)r;
 }
 
-static inline void
-si_resource_reference(struct si_resource **ptr, struct si_resource *res)
+static inline void si_resource_reference(struct si_resource **ptr, struct si_resource *res)
 {
-       pipe_resource_reference((struct pipe_resource **)ptr,
-                               (struct pipe_resource *)res);
+   pipe_resource_reference((struct pipe_resource **)ptr, (struct pipe_resource *)res);
 }
 
-static inline void
-si_texture_reference(struct si_texture **ptr, struct si_texture *res)
+static inline void si_texture_reference(struct si_texture **ptr, struct si_texture *res)
 {
-       pipe_resource_reference((struct pipe_resource **)ptr, &res->buffer.b.b);
+   pipe_resource_reference((struct pipe_resource **)ptr, &res->buffer.b.b);
 }
 
 static inline void
 si_shader_selector_reference(struct si_context *sctx, /* sctx can optionally be NULL */
-                            struct si_shader_selector **dst,
-                            struct si_shader_selector *src)
+                             struct si_shader_selector **dst, struct si_shader_selector *src)
 {
-       if (*dst == src)
-               return;
+   if (*dst == src)
+      return;
 
-       struct si_screen *sscreen = src ? src->screen : (*dst)->screen;
-       util_shader_reference(&sctx->b, &sscreen->live_shader_cache,
-                             (void**)dst, src);
+   struct si_screen *sscreen = src ? src->screen : (*dst)->screen;
+   util_shader_reference(&sctx->b, &sscreen->live_shader_cache, (void **)dst, src);
 }
 
-static inline bool
-vi_dcc_enabled(struct si_texture *tex, unsigned level)
+static inline bool vi_dcc_enabled(struct si_texture *tex, unsigned level)
 {
-       return tex->surface.dcc_offset && level < tex->surface.num_dcc_levels;
+   return tex->surface.dcc_offset && level < tex->surface.num_dcc_levels;
 }
 
-static inline unsigned
-si_tile_mode_index(struct si_texture *tex, unsigned level, bool stencil)
+static inline unsigned si_tile_mode_index(struct si_texture *tex, unsigned level, bool stencil)
 {
-       if (stencil)
-               return tex->surface.u.legacy.stencil_tiling_index[level];
-       else
-               return tex->surface.u.legacy.tiling_index[level];
+   if (stencil)
+      return tex->surface.u.legacy.stencil_tiling_index[level];
+   else
+      return tex->surface.u.legacy.tiling_index[level];
 }
 
-static inline unsigned
-si_get_minimum_num_gfx_cs_dwords(struct si_context *sctx)
+static inline unsigned si_get_minimum_num_gfx_cs_dwords(struct si_context *sctx)
 {
-       /* Don't count the needed CS space exactly and just use an upper bound.
-        *
-        * Also reserve space for stopping queries at the end of IB, because
-        * the number of active queries is unlimited in theory.
-        */
-       return 2048 + sctx->num_cs_dw_queries_suspend;
+   /* Don't count the needed CS space exactly and just use an upper bound.
+    *
+    * Also reserve space for stopping queries at the end of IB, because
+    * the number of active queries is unlimited in theory.
+    */
+   return 2048 + sctx->num_cs_dw_queries_suspend;
 }
 
-static inline void
-si_context_add_resource_size(struct si_context *sctx, struct pipe_resource *r)
+static inline void si_context_add_resource_size(struct si_context *sctx, struct pipe_resource *r)
 {
-       if (r) {
-               /* Add memory usage for need_gfx_cs_space */
-               sctx->vram += si_resource(r)->vram_usage;
-               sctx->gtt += si_resource(r)->gart_usage;
-       }
+   if (r) {
+      /* Add memory usage for need_gfx_cs_space */
+      sctx->vram += si_resource(r)->vram_usage;
+      sctx->gtt += si_resource(r)->gart_usage;
+   }
 }
 
-static inline void
-si_invalidate_draw_sh_constants(struct si_context *sctx)
+static inline void si_invalidate_draw_sh_constants(struct si_context *sctx)
 {
-       sctx->last_base_vertex = SI_BASE_VERTEX_UNKNOWN;
-       sctx->last_instance_count = SI_INSTANCE_COUNT_UNKNOWN;
+   sctx->last_base_vertex = SI_BASE_VERTEX_UNKNOWN;
+   sctx->last_instance_count = SI_INSTANCE_COUNT_UNKNOWN;
 }
 
-static inline unsigned
-si_get_atom_bit(struct si_context *sctx, struct si_atom *atom)
+static inline unsigned si_get_atom_bit(struct si_context *sctx, struct si_atom *atom)
 {
-       return 1 << (atom - sctx->atoms.array);
+   return 1 << (atom - sctx->atoms.array);
 }
 
-static inline void
-si_set_atom_dirty(struct si_context *sctx, struct si_atom *atom, bool dirty)
+static inline void si_set_atom_dirty(struct si_context *sctx, struct si_atom *atom, bool dirty)
 {
-       unsigned bit = si_get_atom_bit(sctx, atom);
+   unsigned bit = si_get_atom_bit(sctx, atom);
 
-       if (dirty)
-               sctx->dirty_atoms |= bit;
-       else
-               sctx->dirty_atoms &= ~bit;
+   if (dirty)
+      sctx->dirty_atoms |= bit;
+   else
+      sctx->dirty_atoms &= ~bit;
 }
 
-static inline bool
-si_is_atom_dirty(struct si_context *sctx, struct si_atom *atom)
+static inline bool si_is_atom_dirty(struct si_context *sctx, struct si_atom *atom)
 {
-       return (sctx->dirty_atoms & si_get_atom_bit(sctx, atom)) != 0;
+   return (sctx->dirty_atoms & si_get_atom_bit(sctx, atom)) != 0;
 }
 
-static inline void
-si_mark_atom_dirty(struct si_context *sctx, struct si_atom *atom)
+static inline void si_mark_atom_dirty(struct si_context *sctx, struct si_atom *atom)
 {
-       si_set_atom_dirty(sctx, atom, true);
+   si_set_atom_dirty(sctx, atom, true);
 }
 
 static inline struct si_shader_ctx_state *si_get_vs(struct si_context *sctx)
 {
-       if (sctx->gs_shader.cso)
-               return &sctx->gs_shader;
-       if (sctx->tes_shader.cso)
-               return &sctx->tes_shader;
+   if (sctx->gs_shader.cso)
+      return &sctx->gs_shader;
+   if (sctx->tes_shader.cso)
+      return &sctx->tes_shader;
 
-       return &sctx->vs_shader;
+   return &sctx->vs_shader;
 }
 
 static inline struct si_shader_info *si_get_vs_info(struct si_context *sctx)
 {
-       struct si_shader_ctx_state *vs = si_get_vs(sctx);
+   struct si_shader_ctx_state *vs = si_get_vs(sctx);
 
-       return vs->cso ? &vs->cso->info : NULL;
+   return vs->cso ? &vs->cso->info : NULL;
 }
 
-static inline struct si_shader* si_get_vs_state(struct si_context *sctx)
+static inline struct si_shader *si_get_vs_state(struct si_context *sctx)
 {
-       if (sctx->gs_shader.cso &&
-           sctx->gs_shader.current &&
-           !sctx->gs_shader.current->key.as_ngg)
-               return sctx->gs_shader.cso->gs_copy_shader;
+   if (sctx->gs_shader.cso && sctx->gs_shader.current && !sctx->gs_shader.current->key.as_ngg)
+      return sctx->gs_shader.cso->gs_copy_shader;
 
-       struct si_shader_ctx_state *vs = si_get_vs(sctx);
-       return vs->current ? vs->current : NULL;
+   struct si_shader_ctx_state *vs = si_get_vs(sctx);
+   return vs->current ? vs->current : NULL;
 }
 
-static inline bool si_can_dump_shader(struct si_screen *sscreen,
-                                     unsigned processor)
+static inline bool si_can_dump_shader(struct si_screen *sscreen, unsigned processor)
 {
-       return sscreen->debug_flags & (1 << processor);
+   return sscreen->debug_flags & (1 << processor);
 }
 
 static inline bool si_get_strmout_en(struct si_context *sctx)
 {
-       return sctx->streamout.streamout_enabled ||
-              sctx->streamout.prims_gen_query_enabled;
+   return sctx->streamout.streamout_enabled || sctx->streamout.prims_gen_query_enabled;
 }
 
-static inline unsigned
-si_optimal_tcc_alignment(struct si_context *sctx, unsigned upload_size)
+static inline unsigned si_optimal_tcc_alignment(struct si_context *sctx, unsigned upload_size)
 {
-       unsigned alignment, tcc_cache_line_size;
-
-       /* If the upload size is less than the cache line size (e.g. 16, 32),
-        * the whole thing will fit into a cache line if we align it to its size.
-        * The idea is that multiple small uploads can share a cache line.
-        * If the upload size is greater, align it to the cache line size.
-        */
-       alignment = util_next_power_of_two(upload_size);
-       tcc_cache_line_size = sctx->screen->info.tcc_cache_line_size;
-       return MIN2(alignment, tcc_cache_line_size);
+   unsigned alignment, tcc_cache_line_size;
+
+   /* If the upload size is less than the cache line size (e.g. 16, 32),
+    * the whole thing will fit into a cache line if we align it to its size.
+    * The idea is that multiple small uploads can share a cache line.
+    * If the upload size is greater, align it to the cache line size.
+    */
+   alignment = util_next_power_of_two(upload_size);
+   tcc_cache_line_size = sctx->screen->info.tcc_cache_line_size;
+   return MIN2(alignment, tcc_cache_line_size);
 }
 
-static inline void
-si_saved_cs_reference(struct si_saved_cs **dst, struct si_saved_cs *src)
+static inline void si_saved_cs_reference(struct si_saved_cs **dst, struct si_saved_cs *src)
 {
-       if (pipe_reference(&(*dst)->reference, &src->reference))
-               si_destroy_saved_cs(*dst);
+   if (pipe_reference(&(*dst)->reference, &src->reference))
+      si_destroy_saved_cs(*dst);
 
-       *dst = src;
+   *dst = src;
 }
 
-static inline void
-si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples,
-                          bool shaders_read_metadata, bool dcc_pipe_aligned)
+static inline void si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples,
+                                              bool shaders_read_metadata, bool dcc_pipe_aligned)
 {
-       sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB |
-                      SI_CONTEXT_INV_VCACHE;
-
-       if (sctx->chip_class >= GFX10) {
-               if (sctx->screen->info.tcc_harvested)
-                       sctx->flags |= SI_CONTEXT_INV_L2;
-               else if (shaders_read_metadata)
-                       sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
-       } else if (sctx->chip_class == GFX9) {
-               /* Single-sample color is coherent with shaders on GFX9, but
-                * L2 metadata must be flushed if shaders read metadata.
-                * (DCC, CMASK).
-                */
-               if (num_samples >= 2 ||
-                   (shaders_read_metadata && !dcc_pipe_aligned))
-                       sctx->flags |= SI_CONTEXT_INV_L2;
-               else if (shaders_read_metadata)
-                       sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
-       } else {
-               /* GFX6-GFX8 */
-               sctx->flags |= SI_CONTEXT_INV_L2;
-       }
+   sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_INV_VCACHE;
+
+   if (sctx->chip_class >= GFX10) {
+      if (sctx->screen->info.tcc_harvested)
+         sctx->flags |= SI_CONTEXT_INV_L2;
+      else if (shaders_read_metadata)
+         sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
+   } else if (sctx->chip_class == GFX9) {
+      /* Single-sample color is coherent with shaders on GFX9, but
+       * L2 metadata must be flushed if shaders read metadata.
+       * (DCC, CMASK).
+       */
+      if (num_samples >= 2 || (shaders_read_metadata && !dcc_pipe_aligned))
+         sctx->flags |= SI_CONTEXT_INV_L2;
+      else if (shaders_read_metadata)
+         sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
+   } else {
+      /* GFX6-GFX8 */
+      sctx->flags |= SI_CONTEXT_INV_L2;
+   }
 }
 
-static inline void
-si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples,
-                          bool include_stencil, bool shaders_read_metadata)
+static inline void si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples,
+                                              bool include_stencil, bool shaders_read_metadata)
 {
-       sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB |
-                      SI_CONTEXT_INV_VCACHE;
-
-       if (sctx->chip_class >= GFX10) {
-               if (sctx->screen->info.tcc_harvested)
-                       sctx->flags |= SI_CONTEXT_INV_L2;
-               else if (shaders_read_metadata)
-                       sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
-       } else if (sctx->chip_class == GFX9) {
-               /* Single-sample depth (not stencil) is coherent with shaders
-                * on GFX9, but L2 metadata must be flushed if shaders read
-                * metadata.
-                */
-               if (num_samples >= 2 || include_stencil)
-                       sctx->flags |= SI_CONTEXT_INV_L2;
-               else if (shaders_read_metadata)
-                       sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
-       } else {
-               /* GFX6-GFX8 */
-               sctx->flags |= SI_CONTEXT_INV_L2;
-       }
+   sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_INV_VCACHE;
+
+   if (sctx->chip_class >= GFX10) {
+      if (sctx->screen->info.tcc_harvested)
+         sctx->flags |= SI_CONTEXT_INV_L2;
+      else if (shaders_read_metadata)
+         sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
+   } else if (sctx->chip_class == GFX9) {
+      /* Single-sample depth (not stencil) is coherent with shaders
+       * on GFX9, but L2 metadata must be flushed if shaders read
+       * metadata.
+       */
+      if (num_samples >= 2 || include_stencil)
+         sctx->flags |= SI_CONTEXT_INV_L2;
+      else if (shaders_read_metadata)
+         sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
+   } else {
+      /* GFX6-GFX8 */
+      sctx->flags |= SI_CONTEXT_INV_L2;
+   }
 }
 
-static inline bool
-si_can_sample_zs(struct si_texture *tex, bool stencil_sampler)
+static inline bool si_can_sample_zs(struct si_texture *tex, bool stencil_sampler)
 {
-       return (stencil_sampler && tex->can_sample_s) ||
-              (!stencil_sampler && tex->can_sample_z);
+   return (stencil_sampler && tex->can_sample_s) || (!stencil_sampler && tex->can_sample_z);
 }
 
-static inline bool
-si_htile_enabled(struct si_texture *tex, unsigned level, unsigned zs_mask)
+static inline bool si_htile_enabled(struct si_texture *tex, unsigned level, unsigned zs_mask)
 {
-       if (zs_mask == PIPE_MASK_S && tex->htile_stencil_disabled)
-               return false;
+   if (zs_mask == PIPE_MASK_S && tex->htile_stencil_disabled)
+      return false;
 
-       return tex->surface.htile_offset && level == 0;
+   return tex->surface.htile_offset && level == 0;
 }
 
-static inline bool
-vi_tc_compat_htile_enabled(struct si_texture *tex, unsigned level, unsigned zs_mask)
+static inline bool vi_tc_compat_htile_enabled(struct si_texture *tex, unsigned level,
+                                              unsigned zs_mask)
 {
-       assert(!tex->tc_compatible_htile || tex->surface.htile_offset);
-       return tex->tc_compatible_htile && si_htile_enabled(tex, level, zs_mask);
+   assert(!tex->tc_compatible_htile || tex->surface.htile_offset);
+   return tex->tc_compatible_htile && si_htile_enabled(tex, level, zs_mask);
 }
 
 static inline unsigned si_get_ps_iter_samples(struct si_context *sctx)
 {
-       if (sctx->ps_uses_fbfetch)
-               return sctx->framebuffer.nr_color_samples;
+   if (sctx->ps_uses_fbfetch)
+      return sctx->framebuffer.nr_color_samples;
 
-       return MIN2(sctx->ps_iter_samples, sctx->framebuffer.nr_color_samples);
+   return MIN2(sctx->ps_iter_samples, sctx->framebuffer.nr_color_samples);
 }
 
 static inline unsigned si_get_total_colormask(struct si_context *sctx)
 {
-       if (sctx->queued.named.rasterizer->rasterizer_discard)
-               return 0;
+   if (sctx->queued.named.rasterizer->rasterizer_discard)
+      return 0;
 
-       struct si_shader_selector *ps = sctx->ps_shader.cso;
-       if (!ps)
-               return 0;
+   struct si_shader_selector *ps = sctx->ps_shader.cso;
+   if (!ps)
+      return 0;
 
-       unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit &
-                            sctx->queued.named.blend->cb_target_mask;
+   unsigned colormask =
+      sctx->framebuffer.colorbuf_enabled_4bit & sctx->queued.named.blend->cb_target_mask;
 
-       if (!ps->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS])
-               colormask &= ps->colors_written_4bit;
-       else if (!ps->colors_written_4bit)
-               colormask = 0; /* color0 writes all cbufs, but it's not written */
+   if (!ps->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS])
+      colormask &= ps->colors_written_4bit;
+   else if (!ps->colors_written_4bit)
+      colormask = 0; /* color0 writes all cbufs, but it's not written */
 
-       return colormask;
+   return colormask;
 }
 
-#define UTIL_ALL_PRIM_LINE_MODES ((1 << PIPE_PRIM_LINES) | \
-                                 (1 << PIPE_PRIM_LINE_LOOP) | \
-                                 (1 << PIPE_PRIM_LINE_STRIP) | \
-                                 (1 << PIPE_PRIM_LINES_ADJACENCY) | \
-                                 (1 << PIPE_PRIM_LINE_STRIP_ADJACENCY))
+#define UTIL_ALL_PRIM_LINE_MODES                                                                   \
+   ((1 << PIPE_PRIM_LINES) | (1 << PIPE_PRIM_LINE_LOOP) | (1 << PIPE_PRIM_LINE_STRIP) |            \
+    (1 << PIPE_PRIM_LINES_ADJACENCY) | (1 << PIPE_PRIM_LINE_STRIP_ADJACENCY))
 
 static inline bool util_prim_is_lines(unsigned prim)
 {
-       return ((1 << prim) & UTIL_ALL_PRIM_LINE_MODES) != 0;
+   return ((1 << prim) & UTIL_ALL_PRIM_LINE_MODES) != 0;
 }
 
 static inline bool util_prim_is_points_or_lines(unsigned prim)
 {
-       return ((1 << prim) & (UTIL_ALL_PRIM_LINE_MODES |
-                              (1 << PIPE_PRIM_POINTS))) != 0;
+   return ((1 << prim) & (UTIL_ALL_PRIM_LINE_MODES | (1 << PIPE_PRIM_POINTS))) != 0;
 }
 
 static inline bool util_rast_prim_is_triangles(unsigned prim)
 {
-       return ((1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) |
-                              (1 << PIPE_PRIM_TRIANGLE_STRIP) |
-                              (1 << PIPE_PRIM_TRIANGLE_FAN) |
-                              (1 << PIPE_PRIM_QUADS) |
-                              (1 << PIPE_PRIM_QUAD_STRIP) |
-                              (1 << PIPE_PRIM_POLYGON) |
-                              (1 << PIPE_PRIM_TRIANGLES_ADJACENCY) |
-                              (1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY)));
+   return ((1 << prim) &
+           ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP) |
+            (1 << PIPE_PRIM_TRIANGLE_FAN) | (1 << PIPE_PRIM_QUADS) | (1 << PIPE_PRIM_QUAD_STRIP) |
+            (1 << PIPE_PRIM_POLYGON) | (1 << PIPE_PRIM_TRIANGLES_ADJACENCY) |
+            (1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY)));
 }
 
 /**
@@ -1888,20 +1794,18 @@ static inline bool util_rast_prim_is_triangles(unsigned prim)
  * \param vram      VRAM memory size not added to the buffer list yet
  * \param gtt       GTT memory size not added to the buffer list yet
  */
-static inline bool
-radeon_cs_memory_below_limit(struct si_screen *screen,
-                            struct radeon_cmdbuf *cs,
-                            uint64_t vram, uint64_t gtt)
+static inline bool radeon_cs_memory_below_limit(struct si_screen *screen, struct radeon_cmdbuf *cs,
+                                                uint64_t vram, uint64_t gtt)
 {
-       vram += cs->used_vram;
-       gtt += cs->used_gart;
+   vram += cs->used_vram;
+   gtt += cs->used_gart;
 
-       /* Anything that goes above the VRAM size should go to GTT. */
-       if (vram > screen->info.vram_size)
-               gtt += vram - screen->info.vram_size;
+   /* Anything that goes above the VRAM size should go to GTT. */
+   if (vram > screen->info.vram_size)
+      gtt += vram - screen->info.vram_size;
 
-       /* Now we just need to check if we have enough GTT. */
-       return gtt < screen->info.gart_size * 0.7;
+   /* Now we just need to check if we have enough GTT. */
+   return gtt < screen->info.gart_size * 0.7;
 }
 
 /**
@@ -1914,17 +1818,13 @@ radeon_cs_memory_below_limit(struct si_screen *screen,
  * The buffer list becomes empty after every context flush and must be
  * rebuilt.
  */
-static inline void radeon_add_to_buffer_list(struct si_context *sctx,
-                                            struct radeon_cmdbuf *cs,
-                                            struct si_resource *bo,
-                                            enum radeon_bo_usage usage,
-                                            enum radeon_bo_priority priority)
+static inline void radeon_add_to_buffer_list(struct si_context *sctx, struct radeon_cmdbuf *cs,
+                                             struct si_resource *bo, enum radeon_bo_usage usage,
+                                             enum radeon_bo_priority priority)
 {
-       assert(usage);
-       sctx->ws->cs_add_buffer(
-               cs, bo->buf,
-               (enum radeon_bo_usage)(usage | RADEON_USAGE_SYNCHRONIZED),
-               bo->domains, priority);
+   assert(usage);
+   sctx->ws->cs_add_buffer(cs, bo->buf, (enum radeon_bo_usage)(usage | RADEON_USAGE_SYNCHRONIZED),
+                           bo->domains, priority);
 }
 
 /**
@@ -1944,52 +1844,49 @@ static inline void radeon_add_to_buffer_list(struct si_context *sctx,
  * - if shader resource "enabled_mask" is not up-to-date or there is
  *   a different constraint disallowing a context flush
  */
-static inline void
-radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sctx,
-                                       struct si_resource *bo,
-                                       enum radeon_bo_usage usage,
-                                       enum radeon_bo_priority priority,
-                                       bool check_mem)
+static inline void radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sctx,
+                                                           struct si_resource *bo,
+                                                           enum radeon_bo_usage usage,
+                                                           enum radeon_bo_priority priority,
+                                                           bool check_mem)
 {
-       if (check_mem &&
-           !radeon_cs_memory_below_limit(sctx->screen, sctx->gfx_cs,
-                                         sctx->vram + bo->vram_usage,
-                                         sctx->gtt + bo->gart_usage))
-               si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+   if (check_mem &&
+       !radeon_cs_memory_below_limit(sctx->screen, sctx->gfx_cs, sctx->vram + bo->vram_usage,
+                                     sctx->gtt + bo->gart_usage))
+      si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 
-       radeon_add_to_buffer_list(sctx, sctx->gfx_cs, bo, usage, priority);
+   radeon_add_to_buffer_list(sctx, sctx->gfx_cs, bo, usage, priority);
 }
 
 static inline bool si_compute_prim_discard_enabled(struct si_context *sctx)
 {
-       return sctx->prim_discard_vertex_count_threshold != UINT_MAX;
+   return sctx->prim_discard_vertex_count_threshold != UINT_MAX;
 }
 
 static inline unsigned si_get_wave_size(struct si_screen *sscreen,
-                                       enum pipe_shader_type shader_type,
-                                       bool ngg, bool es, bool prim_discard_cs)
+                                        enum pipe_shader_type shader_type, bool ngg, bool es,
+                                        bool prim_discard_cs)
 {
-       if (shader_type == PIPE_SHADER_COMPUTE)
-               return sscreen->compute_wave_size;
-       else if (shader_type == PIPE_SHADER_FRAGMENT)
-               return sscreen->ps_wave_size;
-       else if ((shader_type == PIPE_SHADER_VERTEX && prim_discard_cs) || /* only Wave64 implemented */
-                (shader_type == PIPE_SHADER_VERTEX && es && !ngg) ||
-                (shader_type == PIPE_SHADER_TESS_EVAL && es && !ngg) ||
-                (shader_type == PIPE_SHADER_GEOMETRY && !ngg)) /* legacy GS only supports Wave64 */
-               return 64;
-       else
-               return sscreen->ge_wave_size;
+   if (shader_type == PIPE_SHADER_COMPUTE)
+      return sscreen->compute_wave_size;
+   else if (shader_type == PIPE_SHADER_FRAGMENT)
+      return sscreen->ps_wave_size;
+   else if ((shader_type == PIPE_SHADER_VERTEX && prim_discard_cs) || /* only Wave64 implemented */
+            (shader_type == PIPE_SHADER_VERTEX && es && !ngg) ||
+            (shader_type == PIPE_SHADER_TESS_EVAL && es && !ngg) ||
+            (shader_type == PIPE_SHADER_GEOMETRY && !ngg)) /* legacy GS only supports Wave64 */
+      return 64;
+   else
+      return sscreen->ge_wave_size;
 }
 
 static inline unsigned si_get_shader_wave_size(struct si_shader *shader)
 {
-       return si_get_wave_size(shader->selector->screen, shader->selector->type,
-                               shader->key.as_ngg, shader->key.as_es,
-                               shader->key.opt.vs_as_prim_discard_cs);
+   return si_get_wave_size(shader->selector->screen, shader->selector->type, shader->key.as_ngg,
+                           shader->key.as_es, shader->key.opt.vs_as_prim_discard_cs);
 }
 
-#define PRINT_ERR(fmt, args...) \
-       fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)
+#define PRINT_ERR(fmt, args...)                                                                    \
+   fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)
 
 #endif
index 0b7d53e745dd2a95ffd95b312a974f4bd4527b0a..9b63ba6997300e646d77abdf7a8b9c9cadf8b9c6 100644 (file)
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include "util/u_memory.h"
 #include "si_pipe.h"
 #include "sid.h"
+#include "util/u_memory.h"
 
 void si_pm4_cmd_begin(struct si_pm4_state *state, unsigned opcode)
 {
-       state->last_opcode = opcode;
-       state->last_pm4 = state->ndw++;
+   state->last_opcode = opcode;
+   state->last_pm4 = state->ndw++;
 }
 
 void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw)
 {
-       state->pm4[state->ndw++] = dw;
+   state->pm4[state->ndw++] = dw;
 }
 
 void si_pm4_cmd_end(struct si_pm4_state *state, bool predicate)
 {
-       unsigned count;
-       count = state->ndw - state->last_pm4 - 2;
-       state->pm4[state->last_pm4] =
-               PKT3(state->last_opcode, count, predicate);
+   unsigned count;
+   count = state->ndw - state->last_pm4 - 2;
+   state->pm4[state->last_pm4] = PKT3(state->last_opcode, count, predicate);
 
-       assert(state->ndw <= SI_PM4_MAX_DW);
+   assert(state->ndw <= SI_PM4_MAX_DW);
 }
 
 void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val)
 {
-       unsigned opcode;
+   unsigned opcode;
 
-       if (reg >= SI_CONFIG_REG_OFFSET && reg < SI_CONFIG_REG_END) {
-               opcode = PKT3_SET_CONFIG_REG;
-               reg -= SI_CONFIG_REG_OFFSET;
+   if (reg >= SI_CONFIG_REG_OFFSET && reg < SI_CONFIG_REG_END) {
+      opcode = PKT3_SET_CONFIG_REG;
+      reg -= SI_CONFIG_REG_OFFSET;
 
-       } else if (reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END) {
-               opcode = PKT3_SET_SH_REG;
-               reg -= SI_SH_REG_OFFSET;
+   } else if (reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END) {
+      opcode = PKT3_SET_SH_REG;
+      reg -= SI_SH_REG_OFFSET;
 
-       } else if (reg >= SI_CONTEXT_REG_OFFSET && reg < SI_CONTEXT_REG_END) {
-               opcode = PKT3_SET_CONTEXT_REG;
-               reg -= SI_CONTEXT_REG_OFFSET;
+   } else if (reg >= SI_CONTEXT_REG_OFFSET && reg < SI_CONTEXT_REG_END) {
+      opcode = PKT3_SET_CONTEXT_REG;
+      reg -= SI_CONTEXT_REG_OFFSET;
 
-       } else if (reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END) {
-               opcode = PKT3_SET_UCONFIG_REG;
-               reg -= CIK_UCONFIG_REG_OFFSET;
+   } else if (reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END) {
+      opcode = PKT3_SET_UCONFIG_REG;
+      reg -= CIK_UCONFIG_REG_OFFSET;
 
-       } else {
-               PRINT_ERR("Invalid register offset %08x!\n", reg);
-               return;
-       }
+   } else {
+      PRINT_ERR("Invalid register offset %08x!\n", reg);
+      return;
+   }
 
-       reg >>= 2;
+   reg >>= 2;
 
-       if (opcode != state->last_opcode || reg != (state->last_reg + 1)) {
-               si_pm4_cmd_begin(state, opcode);
-               si_pm4_cmd_add(state, reg);
-       }
+   if (opcode != state->last_opcode || reg != (state->last_reg + 1)) {
+      si_pm4_cmd_begin(state, opcode);
+      si_pm4_cmd_add(state, reg);
+   }
 
-       state->last_reg = reg;
-       si_pm4_cmd_add(state, val);
-       si_pm4_cmd_end(state, false);
+   state->last_reg = reg;
+   si_pm4_cmd_add(state, val);
+   si_pm4_cmd_end(state, false);
 }
 
-void si_pm4_add_bo(struct si_pm4_state *state,
-                   struct si_resource *bo,
-                   enum radeon_bo_usage usage,
-                  enum radeon_bo_priority priority)
+void si_pm4_add_bo(struct si_pm4_state *state, struct si_resource *bo, enum radeon_bo_usage usage,
+                   enum radeon_bo_priority priority)
 {
-       unsigned idx = state->nbo++;
-       assert(idx < SI_PM4_MAX_BO);
+   unsigned idx = state->nbo++;
+   assert(idx < SI_PM4_MAX_BO);
 
-       si_resource_reference(&state->bo[idx], bo);
-       state->bo_usage[idx] = usage;
-       state->bo_priority[idx] = priority;
+   si_resource_reference(&state->bo[idx], bo);
+   state->bo_usage[idx] = usage;
+   state->bo_priority[idx] = priority;
 }
 
 void si_pm4_clear_state(struct si_pm4_state *state)
 {
-       for (int i = 0; i < state->nbo; ++i)
-               si_resource_reference(&state->bo[i], NULL);
-       si_resource_reference(&state->indirect_buffer, NULL);
-       state->nbo = 0;
-       state->ndw = 0;
+   for (int i = 0; i < state->nbo; ++i)
+      si_resource_reference(&state->bo[i], NULL);
+   si_resource_reference(&state->indirect_buffer, NULL);
+   state->nbo = 0;
+   state->ndw = 0;
 }
 
-void si_pm4_free_state(struct si_context *sctx,
-                      struct si_pm4_state *state,
-                      unsigned idx)
+void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsigned idx)
 {
-       if (!state)
-               return;
+   if (!state)
+      return;
 
-       if (idx != ~0 && sctx->emitted.array[idx] == state) {
-               sctx->emitted.array[idx] = NULL;
-       }
+   if (idx != ~0 && sctx->emitted.array[idx] == state) {
+      sctx->emitted.array[idx] = NULL;
+   }
 
-       si_pm4_clear_state(state);
-       FREE(state);
+   si_pm4_clear_state(state);
+   FREE(state);
 }
 
 void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-
-       for (int i = 0; i < state->nbo; ++i) {
-               radeon_add_to_buffer_list(sctx, sctx->gfx_cs, state->bo[i],
-                                     state->bo_usage[i], state->bo_priority[i]);
-       }
-
-       if (!state->indirect_buffer) {
-               radeon_emit_array(cs, state->pm4, state->ndw);
-       } else {
-               struct si_resource *ib = state->indirect_buffer;
-
-               radeon_add_to_buffer_list(sctx, sctx->gfx_cs, ib,
-                                         RADEON_USAGE_READ,
-                                          RADEON_PRIO_IB2);
-
-               radeon_emit(cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
-               radeon_emit(cs, ib->gpu_address);
-               radeon_emit(cs, ib->gpu_address >> 32);
-               radeon_emit(cs, (ib->b.b.width0 >> 2) & 0xfffff);
-       }
-
-       if (state->atom.emit)
-               state->atom.emit(sctx);
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+   for (int i = 0; i < state->nbo; ++i) {
+      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, state->bo[i], state->bo_usage[i],
+                                state->bo_priority[i]);
+   }
+
+   if (!state->indirect_buffer) {
+      radeon_emit_array(cs, state->pm4, state->ndw);
+   } else {
+      struct si_resource *ib = state->indirect_buffer;
+
+      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, ib, RADEON_USAGE_READ, RADEON_PRIO_IB2);
+
+      radeon_emit(cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
+      radeon_emit(cs, ib->gpu_address);
+      radeon_emit(cs, ib->gpu_address >> 32);
+      radeon_emit(cs, (ib->b.b.width0 >> 2) & 0xfffff);
+   }
+
+   if (state->atom.emit)
+      state->atom.emit(sctx);
 }
 
 void si_pm4_reset_emitted(struct si_context *sctx)
 {
-       memset(&sctx->emitted, 0, sizeof(sctx->emitted));
-       sctx->dirty_states |= u_bit_consecutive(0, SI_NUM_STATES);
+   memset(&sctx->emitted, 0, sizeof(sctx->emitted));
+   sctx->dirty_states |= u_bit_consecutive(0, SI_NUM_STATES);
 }
 
-void si_pm4_upload_indirect_buffer(struct si_context *sctx,
-                                  struct si_pm4_state *state)
+void si_pm4_upload_indirect_buffer(struct si_context *sctx, struct si_pm4_state *state)
 {
-       struct pipe_screen *screen = sctx->b.screen;
-       unsigned aligned_ndw = align(state->ndw, 8);
-
-       /* only supported on GFX7 and later */
-       if (sctx->chip_class < GFX7)
-               return;
-
-       assert(state->ndw);
-       assert(aligned_ndw <= SI_PM4_MAX_DW);
-
-       si_resource_reference(&state->indirect_buffer, NULL);
-       /* TODO: this hangs with 1024 or higher alignment on GFX9. */
-       state->indirect_buffer =
-               si_aligned_buffer_create(screen, 0,
-                                        PIPE_USAGE_DEFAULT, aligned_ndw * 4,
-                                        256);
-       if (!state->indirect_buffer)
-               return;
-
-       /* Pad the IB to 8 DWs to meet CP fetch alignment requirements. */
-       if (sctx->screen->info.gfx_ib_pad_with_type2) {
-               for (int i = state->ndw; i < aligned_ndw; i++)
-                       state->pm4[i] = 0x80000000; /* type2 nop packet */
-       } else {
-               for (int i = state->ndw; i < aligned_ndw; i++)
-                       state->pm4[i] = 0xffff1000; /* type3 nop packet */
-       }
-
-       pipe_buffer_write(&sctx->b, &state->indirect_buffer->b.b,
-                         0, aligned_ndw *4, state->pm4);
+   struct pipe_screen *screen = sctx->b.screen;
+   unsigned aligned_ndw = align(state->ndw, 8);
+
+   /* only supported on GFX7 and later */
+   if (sctx->chip_class < GFX7)
+      return;
+
+   assert(state->ndw);
+   assert(aligned_ndw <= SI_PM4_MAX_DW);
+
+   si_resource_reference(&state->indirect_buffer, NULL);
+   /* TODO: this hangs with 1024 or higher alignment on GFX9. */
+   state->indirect_buffer =
+      si_aligned_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, aligned_ndw * 4, 256);
+   if (!state->indirect_buffer)
+      return;
+
+   /* Pad the IB to 8 DWs to meet CP fetch alignment requirements. */
+   if (sctx->screen->info.gfx_ib_pad_with_type2) {
+      for (int i = state->ndw; i < aligned_ndw; i++)
+         state->pm4[i] = 0x80000000; /* type2 nop packet */
+   } else {
+      for (int i = state->ndw; i < aligned_ndw; i++)
+         state->pm4[i] = 0xffff1000; /* type3 nop packet */
+   }
+
+   pipe_buffer_write(&sctx->b, &state->indirect_buffer->b.b, 0, aligned_ndw * 4, state->pm4);
 }
index c91a90bc638bfbe4f4b3ca3cc1db93c9530d97ca..783833e5a42681bbd15dad3042780cb830957f94 100644 (file)
@@ -27,8 +27,8 @@
 
 #include "radeon/radeon_winsys.h"
 
-#define SI_PM4_MAX_DW          176
-#define SI_PM4_MAX_BO          3
+#define SI_PM4_MAX_DW 176
+#define SI_PM4_MAX_BO 3
 
 // forward defines
 struct si_context;
@@ -37,32 +37,31 @@ struct si_context;
  * command buffer (AKA indirect buffer, AKA IB, AKA command stream, AKA CS).
  */
 struct si_atom {
-       void (*emit)(struct si_context *ctx);
+   void (*emit)(struct si_context *ctx);
 };
 
-struct si_pm4_state
-{
-       /* optional indirect buffer */
-       struct si_resource      *indirect_buffer;
+struct si_pm4_state {
+   /* optional indirect buffer */
+   struct si_resource *indirect_buffer;
 
-       /* PKT3_SET_*_REG handling */
-       unsigned        last_opcode;
-       unsigned        last_reg;
-       unsigned        last_pm4;
+   /* PKT3_SET_*_REG handling */
+   unsigned last_opcode;
+   unsigned last_reg;
+   unsigned last_pm4;
 
-       /* commands for the DE */
-       unsigned        ndw;
-       uint32_t        pm4[SI_PM4_MAX_DW];
+   /* commands for the DE */
+   unsigned ndw;
+   uint32_t pm4[SI_PM4_MAX_DW];
 
-       /* BO's referenced by this state */
-       unsigned                nbo;
-       struct si_resource      *bo[SI_PM4_MAX_BO];
-       enum radeon_bo_usage    bo_usage[SI_PM4_MAX_BO];
-       enum radeon_bo_priority bo_priority[SI_PM4_MAX_BO];
+   /* BO's referenced by this state */
+   unsigned nbo;
+   struct si_resource *bo[SI_PM4_MAX_BO];
+   enum radeon_bo_usage bo_usage[SI_PM4_MAX_BO];
+   enum radeon_bo_priority bo_priority[SI_PM4_MAX_BO];
 
-       /* For shader states only */
-       struct si_shader *shader;
-       struct si_atom atom;
+   /* For shader states only */
+   struct si_shader *shader;
+   struct si_atom atom;
 };
 
 void si_pm4_cmd_begin(struct si_pm4_state *state, unsigned opcode);
@@ -70,17 +69,12 @@ void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw);
 void si_pm4_cmd_end(struct si_pm4_state *state, bool predicate);
 
 void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val);
-void si_pm4_add_bo(struct si_pm4_state *state,
-                  struct si_resource *bo,
-                  enum radeon_bo_usage usage,
-                  enum radeon_bo_priority priority);
-void si_pm4_upload_indirect_buffer(struct si_context *sctx,
-                                  struct si_pm4_state *state);
+void si_pm4_add_bo(struct si_pm4_state *state, struct si_resource *bo, enum radeon_bo_usage usage,
+                   enum radeon_bo_priority priority);
+void si_pm4_upload_indirect_buffer(struct si_context *sctx, struct si_pm4_state *state);
 
 void si_pm4_clear_state(struct si_pm4_state *state);
-void si_pm4_free_state(struct si_context *sctx,
-                      struct si_pm4_state *state,
-                      unsigned idx);
+void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsigned idx);
 
 void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state);
 void si_pm4_reset_emitted(struct si_context *sctx);
index bf80862e095639a2bfaff2efdbd53870d517b09d..6ad293301cb566134e08c7c8daac6361159db088 100644 (file)
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include "si_pipe.h"
 #include "si_query.h"
-#include "util/u_memory.h"
-#include "util/u_upload_mgr.h"
+
+#include "amd/common/sid.h"
+#include "si_pipe.h"
 #include "util/os_time.h"
+#include "util/u_memory.h"
 #include "util/u_suballoc.h"
-#include "amd/common/sid.h"
+#include "util/u_upload_mgr.h"
 
 static const struct si_query_ops query_hw_ops;
 
 struct si_hw_query_params {
-       unsigned start_offset;
-       unsigned end_offset;
-       unsigned fence_offset;
-       unsigned pair_stride;
-       unsigned pair_count;
+   unsigned start_offset;
+   unsigned end_offset;
+   unsigned fence_offset;
+   unsigned pair_stride;
+   unsigned pair_count;
 };
 
 /* Queries without buffer handling or suspend/resume. */
 struct si_query_sw {
-       struct si_query b;
+   struct si_query b;
 
-       uint64_t begin_result;
-       uint64_t end_result;
+   uint64_t begin_result;
+   uint64_t end_result;
 
-       uint64_t begin_time;
-       uint64_t end_time;
+   uint64_t begin_time;
+   uint64_t end_time;
 
-       /* Fence for GPU_FINISHED. */
-       struct pipe_fence_handle *fence;
+   /* Fence for GPU_FINISHED. */
+   struct pipe_fence_handle *fence;
 };
 
-static void si_query_sw_destroy(struct si_context *sctx,
-                               struct si_query *squery)
+static void si_query_sw_destroy(struct si_context *sctx, struct si_query *squery)
 {
-       struct si_query_sw *query = (struct si_query_sw *)squery;
+   struct si_query_sw *query = (struct si_query_sw *)squery;
 
-       sctx->b.screen->fence_reference(sctx->b.screen, &query->fence, NULL);
-       FREE(query);
+   sctx->b.screen->fence_reference(sctx->b.screen, &query->fence, NULL);
+   FREE(query);
 }
 
 static enum radeon_value_id winsys_id_from_type(unsigned type)
 {
-       switch (type) {
-       case SI_QUERY_REQUESTED_VRAM: return RADEON_REQUESTED_VRAM_MEMORY;
-       case SI_QUERY_REQUESTED_GTT: return RADEON_REQUESTED_GTT_MEMORY;
-       case SI_QUERY_MAPPED_VRAM: return RADEON_MAPPED_VRAM;
-       case SI_QUERY_MAPPED_GTT: return RADEON_MAPPED_GTT;
-       case SI_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS;
-       case SI_QUERY_NUM_MAPPED_BUFFERS: return RADEON_NUM_MAPPED_BUFFERS;
-       case SI_QUERY_NUM_GFX_IBS: return RADEON_NUM_GFX_IBS;
-       case SI_QUERY_NUM_SDMA_IBS: return RADEON_NUM_SDMA_IBS;
-       case SI_QUERY_GFX_BO_LIST_SIZE: return RADEON_GFX_BO_LIST_COUNTER;
-       case SI_QUERY_GFX_IB_SIZE: return RADEON_GFX_IB_SIZE_COUNTER;
-       case SI_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED;
-       case SI_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS;
-       case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
-       case SI_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE;
-       case SI_QUERY_VRAM_VIS_USAGE: return RADEON_VRAM_VIS_USAGE;
-       case SI_QUERY_GTT_USAGE: return RADEON_GTT_USAGE;
-       case SI_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE;
-       case SI_QUERY_CURRENT_GPU_SCLK: return RADEON_CURRENT_SCLK;
-       case SI_QUERY_CURRENT_GPU_MCLK: return RADEON_CURRENT_MCLK;
-       case SI_QUERY_CS_THREAD_BUSY: return RADEON_CS_THREAD_TIME;
-       default: unreachable("query type does not correspond to winsys id");
-       }
+   switch (type) {
+   case SI_QUERY_REQUESTED_VRAM:
+      return RADEON_REQUESTED_VRAM_MEMORY;
+   case SI_QUERY_REQUESTED_GTT:
+      return RADEON_REQUESTED_GTT_MEMORY;
+   case SI_QUERY_MAPPED_VRAM:
+      return RADEON_MAPPED_VRAM;
+   case SI_QUERY_MAPPED_GTT:
+      return RADEON_MAPPED_GTT;
+   case SI_QUERY_BUFFER_WAIT_TIME:
+      return RADEON_BUFFER_WAIT_TIME_NS;
+   case SI_QUERY_NUM_MAPPED_BUFFERS:
+      return RADEON_NUM_MAPPED_BUFFERS;
+   case SI_QUERY_NUM_GFX_IBS:
+      return RADEON_NUM_GFX_IBS;
+   case SI_QUERY_NUM_SDMA_IBS:
+      return RADEON_NUM_SDMA_IBS;
+   case SI_QUERY_GFX_BO_LIST_SIZE:
+      return RADEON_GFX_BO_LIST_COUNTER;
+   case SI_QUERY_GFX_IB_SIZE:
+      return RADEON_GFX_IB_SIZE_COUNTER;
+   case SI_QUERY_NUM_BYTES_MOVED:
+      return RADEON_NUM_BYTES_MOVED;
+   case SI_QUERY_NUM_EVICTIONS:
+      return RADEON_NUM_EVICTIONS;
+   case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS:
+      return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
+   case SI_QUERY_VRAM_USAGE:
+      return RADEON_VRAM_USAGE;
+   case SI_QUERY_VRAM_VIS_USAGE:
+      return RADEON_VRAM_VIS_USAGE;
+   case SI_QUERY_GTT_USAGE:
+      return RADEON_GTT_USAGE;
+   case SI_QUERY_GPU_TEMPERATURE:
+      return RADEON_GPU_TEMPERATURE;
+   case SI_QUERY_CURRENT_GPU_SCLK:
+      return RADEON_CURRENT_SCLK;
+   case SI_QUERY_CURRENT_GPU_MCLK:
+      return RADEON_CURRENT_MCLK;
+   case SI_QUERY_CS_THREAD_BUSY:
+      return RADEON_CS_THREAD_TIME;
+   default:
+      unreachable("query type does not correspond to winsys id");
+   }
 }
 
 static int64_t si_finish_dma_get_cpu_time(struct si_context *sctx)
 {
-       struct pipe_fence_handle *fence = NULL;
+   struct pipe_fence_handle *fence = NULL;
 
-       si_flush_dma_cs(sctx, 0, &fence);
-       if (fence) {
-               sctx->ws->fence_wait(sctx->ws, fence, PIPE_TIMEOUT_INFINITE);
-               sctx->ws->fence_reference(&fence, NULL);
-       }
+   si_flush_dma_cs(sctx, 0, &fence);
+   if (fence) {
+      sctx->ws->fence_wait(sctx->ws, fence, PIPE_TIMEOUT_INFINITE);
+      sctx->ws->fence_reference(&fence, NULL);
+   }
 
-       return os_time_get_nano();
+   return os_time_get_nano();
 }
 
-static bool si_query_sw_begin(struct si_context *sctx,
-                             struct si_query *squery)
+static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery)
 {
-       struct si_query_sw *query = (struct si_query_sw *)squery;
-       enum radeon_value_id ws_id;
-
-       switch(query->b.type) {
-       case PIPE_QUERY_TIMESTAMP_DISJOINT:
-       case PIPE_QUERY_GPU_FINISHED:
-               break;
-       case SI_QUERY_TIME_ELAPSED_SDMA_SI:
-               query->begin_result = si_finish_dma_get_cpu_time(sctx);
-               break;
-       case SI_QUERY_DRAW_CALLS:
-               query->begin_result = sctx->num_draw_calls;
-               break;
-       case SI_QUERY_DECOMPRESS_CALLS:
-               query->begin_result = sctx->num_decompress_calls;
-               break;
-       case SI_QUERY_MRT_DRAW_CALLS:
-               query->begin_result = sctx->num_mrt_draw_calls;
-               break;
-       case SI_QUERY_PRIM_RESTART_CALLS:
-               query->begin_result = sctx->num_prim_restart_calls;
-               break;
-       case SI_QUERY_SPILL_DRAW_CALLS:
-               query->begin_result = sctx->num_spill_draw_calls;
-               break;
-       case SI_QUERY_COMPUTE_CALLS:
-               query->begin_result = sctx->num_compute_calls;
-               break;
-       case SI_QUERY_SPILL_COMPUTE_CALLS:
-               query->begin_result = sctx->num_spill_compute_calls;
-               break;
-       case SI_QUERY_DMA_CALLS:
-               query->begin_result = sctx->num_dma_calls;
-               break;
-       case SI_QUERY_CP_DMA_CALLS:
-               query->begin_result = sctx->num_cp_dma_calls;
-               break;
-       case SI_QUERY_NUM_VS_FLUSHES:
-               query->begin_result = sctx->num_vs_flushes;
-               break;
-       case SI_QUERY_NUM_PS_FLUSHES:
-               query->begin_result = sctx->num_ps_flushes;
-               break;
-       case SI_QUERY_NUM_CS_FLUSHES:
-               query->begin_result = sctx->num_cs_flushes;
-               break;
-       case SI_QUERY_NUM_CB_CACHE_FLUSHES:
-               query->begin_result = sctx->num_cb_cache_flushes;
-               break;
-       case SI_QUERY_NUM_DB_CACHE_FLUSHES:
-               query->begin_result = sctx->num_db_cache_flushes;
-               break;
-       case SI_QUERY_NUM_L2_INVALIDATES:
-               query->begin_result = sctx->num_L2_invalidates;
-               break;
-       case SI_QUERY_NUM_L2_WRITEBACKS:
-               query->begin_result = sctx->num_L2_writebacks;
-               break;
-       case SI_QUERY_NUM_RESIDENT_HANDLES:
-               query->begin_result = sctx->num_resident_handles;
-               break;
-       case SI_QUERY_TC_OFFLOADED_SLOTS:
-               query->begin_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
-               break;
-       case SI_QUERY_TC_DIRECT_SLOTS:
-               query->begin_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
-               break;
-       case SI_QUERY_TC_NUM_SYNCS:
-               query->begin_result = sctx->tc ? sctx->tc->num_syncs : 0;
-               break;
-       case SI_QUERY_REQUESTED_VRAM:
-       case SI_QUERY_REQUESTED_GTT:
-       case SI_QUERY_MAPPED_VRAM:
-       case SI_QUERY_MAPPED_GTT:
-       case SI_QUERY_VRAM_USAGE:
-       case SI_QUERY_VRAM_VIS_USAGE:
-       case SI_QUERY_GTT_USAGE:
-       case SI_QUERY_GPU_TEMPERATURE:
-       case SI_QUERY_CURRENT_GPU_SCLK:
-       case SI_QUERY_CURRENT_GPU_MCLK:
-       case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
-       case SI_QUERY_NUM_MAPPED_BUFFERS:
-               query->begin_result = 0;
-               break;
-       case SI_QUERY_BUFFER_WAIT_TIME:
-       case SI_QUERY_GFX_IB_SIZE:
-       case SI_QUERY_NUM_GFX_IBS:
-       case SI_QUERY_NUM_SDMA_IBS:
-       case SI_QUERY_NUM_BYTES_MOVED:
-       case SI_QUERY_NUM_EVICTIONS:
-       case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
-               enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
-               query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
-               break;
-       }
-       case SI_QUERY_GFX_BO_LIST_SIZE:
-               ws_id = winsys_id_from_type(query->b.type);
-               query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
-               query->begin_time = sctx->ws->query_value(sctx->ws,
-                                                         RADEON_NUM_GFX_IBS);
-               break;
-       case SI_QUERY_CS_THREAD_BUSY:
-               ws_id = winsys_id_from_type(query->b.type);
-               query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
-               query->begin_time = os_time_get_nano();
-               break;
-       case SI_QUERY_GALLIUM_THREAD_BUSY:
-               query->begin_result =
-                       sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
-               query->begin_time = os_time_get_nano();
-               break;
-       case SI_QUERY_GPU_LOAD:
-       case SI_QUERY_GPU_SHADERS_BUSY:
-       case SI_QUERY_GPU_TA_BUSY:
-       case SI_QUERY_GPU_GDS_BUSY:
-       case SI_QUERY_GPU_VGT_BUSY:
-       case SI_QUERY_GPU_IA_BUSY:
-       case SI_QUERY_GPU_SX_BUSY:
-       case SI_QUERY_GPU_WD_BUSY:
-       case SI_QUERY_GPU_BCI_BUSY:
-       case SI_QUERY_GPU_SC_BUSY:
-       case SI_QUERY_GPU_PA_BUSY:
-       case SI_QUERY_GPU_DB_BUSY:
-       case SI_QUERY_GPU_CP_BUSY:
-       case SI_QUERY_GPU_CB_BUSY:
-       case SI_QUERY_GPU_SDMA_BUSY:
-       case SI_QUERY_GPU_PFP_BUSY:
-       case SI_QUERY_GPU_MEQ_BUSY:
-       case SI_QUERY_GPU_ME_BUSY:
-       case SI_QUERY_GPU_SURF_SYNC_BUSY:
-       case SI_QUERY_GPU_CP_DMA_BUSY:
-       case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
-               query->begin_result = si_begin_counter(sctx->screen,
-                                                        query->b.type);
-               break;
-       case SI_QUERY_NUM_COMPILATIONS:
-               query->begin_result = p_atomic_read(&sctx->screen->num_compilations);
-               break;
-       case SI_QUERY_NUM_SHADERS_CREATED:
-               query->begin_result = p_atomic_read(&sctx->screen->num_shaders_created);
-               break;
-       case SI_QUERY_LIVE_SHADER_CACHE_HITS:
-               query->begin_result = sctx->screen->live_shader_cache.hits;
-               break;
-       case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
-               query->begin_result = sctx->screen->live_shader_cache.misses;
-               break;
-       case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
-               query->begin_result = sctx->screen->num_memory_shader_cache_hits;
-               break;
-       case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
-               query->begin_result = sctx->screen->num_memory_shader_cache_misses;
-               break;
-       case SI_QUERY_DISK_SHADER_CACHE_HITS:
-               query->begin_result = sctx->screen->num_disk_shader_cache_hits;
-               break;
-       case SI_QUERY_DISK_SHADER_CACHE_MISSES:
-               query->begin_result = sctx->screen->num_disk_shader_cache_misses;
-               break;
-       case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
-               query->begin_result = sctx->compute_num_verts_accepted;
-               break;
-       case SI_QUERY_PD_NUM_PRIMS_REJECTED:
-               query->begin_result = sctx->compute_num_verts_rejected;
-               break;
-       case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
-               query->begin_result = sctx->compute_num_verts_ineligible;
-               break;
-       case SI_QUERY_GPIN_ASIC_ID:
-       case SI_QUERY_GPIN_NUM_SIMD:
-       case SI_QUERY_GPIN_NUM_RB:
-       case SI_QUERY_GPIN_NUM_SPI:
-       case SI_QUERY_GPIN_NUM_SE:
-               break;
-       default:
-               unreachable("si_query_sw_begin: bad query type");
-       }
-
-       return true;
+   struct si_query_sw *query = (struct si_query_sw *)squery;
+   enum radeon_value_id ws_id;
+
+   switch (query->b.type) {
+   case PIPE_QUERY_TIMESTAMP_DISJOINT:
+   case PIPE_QUERY_GPU_FINISHED:
+      break;
+   case SI_QUERY_TIME_ELAPSED_SDMA_SI:
+      query->begin_result = si_finish_dma_get_cpu_time(sctx);
+      break;
+   case SI_QUERY_DRAW_CALLS:
+      query->begin_result = sctx->num_draw_calls;
+      break;
+   case SI_QUERY_DECOMPRESS_CALLS:
+      query->begin_result = sctx->num_decompress_calls;
+      break;
+   case SI_QUERY_MRT_DRAW_CALLS:
+      query->begin_result = sctx->num_mrt_draw_calls;
+      break;
+   case SI_QUERY_PRIM_RESTART_CALLS:
+      query->begin_result = sctx->num_prim_restart_calls;
+      break;
+   case SI_QUERY_SPILL_DRAW_CALLS:
+      query->begin_result = sctx->num_spill_draw_calls;
+      break;
+   case SI_QUERY_COMPUTE_CALLS:
+      query->begin_result = sctx->num_compute_calls;
+      break;
+   case SI_QUERY_SPILL_COMPUTE_CALLS:
+      query->begin_result = sctx->num_spill_compute_calls;
+      break;
+   case SI_QUERY_DMA_CALLS:
+      query->begin_result = sctx->num_dma_calls;
+      break;
+   case SI_QUERY_CP_DMA_CALLS:
+      query->begin_result = sctx->num_cp_dma_calls;
+      break;
+   case SI_QUERY_NUM_VS_FLUSHES:
+      query->begin_result = sctx->num_vs_flushes;
+      break;
+   case SI_QUERY_NUM_PS_FLUSHES:
+      query->begin_result = sctx->num_ps_flushes;
+      break;
+   case SI_QUERY_NUM_CS_FLUSHES:
+      query->begin_result = sctx->num_cs_flushes;
+      break;
+   case SI_QUERY_NUM_CB_CACHE_FLUSHES:
+      query->begin_result = sctx->num_cb_cache_flushes;
+      break;
+   case SI_QUERY_NUM_DB_CACHE_FLUSHES:
+      query->begin_result = sctx->num_db_cache_flushes;
+      break;
+   case SI_QUERY_NUM_L2_INVALIDATES:
+      query->begin_result = sctx->num_L2_invalidates;
+      break;
+   case SI_QUERY_NUM_L2_WRITEBACKS:
+      query->begin_result = sctx->num_L2_writebacks;
+      break;
+   case SI_QUERY_NUM_RESIDENT_HANDLES:
+      query->begin_result = sctx->num_resident_handles;
+      break;
+   case SI_QUERY_TC_OFFLOADED_SLOTS:
+      query->begin_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
+      break;
+   case SI_QUERY_TC_DIRECT_SLOTS:
+      query->begin_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
+      break;
+   case SI_QUERY_TC_NUM_SYNCS:
+      query->begin_result = sctx->tc ? sctx->tc->num_syncs : 0;
+      break;
+   case SI_QUERY_REQUESTED_VRAM:
+   case SI_QUERY_REQUESTED_GTT:
+   case SI_QUERY_MAPPED_VRAM:
+   case SI_QUERY_MAPPED_GTT:
+   case SI_QUERY_VRAM_USAGE:
+   case SI_QUERY_VRAM_VIS_USAGE:
+   case SI_QUERY_GTT_USAGE:
+   case SI_QUERY_GPU_TEMPERATURE:
+   case SI_QUERY_CURRENT_GPU_SCLK:
+   case SI_QUERY_CURRENT_GPU_MCLK:
+   case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
+   case SI_QUERY_NUM_MAPPED_BUFFERS:
+      query->begin_result = 0;
+      break;
+   case SI_QUERY_BUFFER_WAIT_TIME:
+   case SI_QUERY_GFX_IB_SIZE:
+   case SI_QUERY_NUM_GFX_IBS:
+   case SI_QUERY_NUM_SDMA_IBS:
+   case SI_QUERY_NUM_BYTES_MOVED:
+   case SI_QUERY_NUM_EVICTIONS:
+   case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
+      enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
+      query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
+      break;
+   }
+   case SI_QUERY_GFX_BO_LIST_SIZE:
+      ws_id = winsys_id_from_type(query->b.type);
+      query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
+      query->begin_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
+      break;
+   case SI_QUERY_CS_THREAD_BUSY:
+      ws_id = winsys_id_from_type(query->b.type);
+      query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
+      query->begin_time = os_time_get_nano();
+      break;
+   case SI_QUERY_GALLIUM_THREAD_BUSY:
+      query->begin_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
+      query->begin_time = os_time_get_nano();
+      break;
+   case SI_QUERY_GPU_LOAD:
+   case SI_QUERY_GPU_SHADERS_BUSY:
+   case SI_QUERY_GPU_TA_BUSY:
+   case SI_QUERY_GPU_GDS_BUSY:
+   case SI_QUERY_GPU_VGT_BUSY:
+   case SI_QUERY_GPU_IA_BUSY:
+   case SI_QUERY_GPU_SX_BUSY:
+   case SI_QUERY_GPU_WD_BUSY:
+   case SI_QUERY_GPU_BCI_BUSY:
+   case SI_QUERY_GPU_SC_BUSY:
+   case SI_QUERY_GPU_PA_BUSY:
+   case SI_QUERY_GPU_DB_BUSY:
+   case SI_QUERY_GPU_CP_BUSY:
+   case SI_QUERY_GPU_CB_BUSY:
+   case SI_QUERY_GPU_SDMA_BUSY:
+   case SI_QUERY_GPU_PFP_BUSY:
+   case SI_QUERY_GPU_MEQ_BUSY:
+   case SI_QUERY_GPU_ME_BUSY:
+   case SI_QUERY_GPU_SURF_SYNC_BUSY:
+   case SI_QUERY_GPU_CP_DMA_BUSY:
+   case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
+      query->begin_result = si_begin_counter(sctx->screen, query->b.type);
+      break;
+   case SI_QUERY_NUM_COMPILATIONS:
+      query->begin_result = p_atomic_read(&sctx->screen->num_compilations);
+      break;
+   case SI_QUERY_NUM_SHADERS_CREATED:
+      query->begin_result = p_atomic_read(&sctx->screen->num_shaders_created);
+      break;
+   case SI_QUERY_LIVE_SHADER_CACHE_HITS:
+      query->begin_result = sctx->screen->live_shader_cache.hits;
+      break;
+   case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
+      query->begin_result = sctx->screen->live_shader_cache.misses;
+      break;
+   case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
+      query->begin_result = sctx->screen->num_memory_shader_cache_hits;
+      break;
+   case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
+      query->begin_result = sctx->screen->num_memory_shader_cache_misses;
+      break;
+   case SI_QUERY_DISK_SHADER_CACHE_HITS:
+      query->begin_result = sctx->screen->num_disk_shader_cache_hits;
+      break;
+   case SI_QUERY_DISK_SHADER_CACHE_MISSES:
+      query->begin_result = sctx->screen->num_disk_shader_cache_misses;
+      break;
+   case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
+      query->begin_result = sctx->compute_num_verts_accepted;
+      break;
+   case SI_QUERY_PD_NUM_PRIMS_REJECTED:
+      query->begin_result = sctx->compute_num_verts_rejected;
+      break;
+   case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
+      query->begin_result = sctx->compute_num_verts_ineligible;
+      break;
+   case SI_QUERY_GPIN_ASIC_ID:
+   case SI_QUERY_GPIN_NUM_SIMD:
+   case SI_QUERY_GPIN_NUM_RB:
+   case SI_QUERY_GPIN_NUM_SPI:
+   case SI_QUERY_GPIN_NUM_SE:
+      break;
+   default:
+      unreachable("si_query_sw_begin: bad query type");
+   }
+
+   return true;
 }
 
-static bool si_query_sw_end(struct si_context *sctx,
-                           struct si_query *squery)
+static bool si_query_sw_end(struct si_context *sctx, struct si_query *squery)
 {
-       struct si_query_sw *query = (struct si_query_sw *)squery;
-       enum radeon_value_id ws_id;
-
-       switch(query->b.type) {
-       case PIPE_QUERY_TIMESTAMP_DISJOINT:
-               break;
-       case PIPE_QUERY_GPU_FINISHED:
-               sctx->b.flush(&sctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
-               break;
-       case SI_QUERY_TIME_ELAPSED_SDMA_SI:
-               query->end_result = si_finish_dma_get_cpu_time(sctx);
-               break;
-       case SI_QUERY_DRAW_CALLS:
-               query->end_result = sctx->num_draw_calls;
-               break;
-       case SI_QUERY_DECOMPRESS_CALLS:
-               query->end_result = sctx->num_decompress_calls;
-               break;
-       case SI_QUERY_MRT_DRAW_CALLS:
-               query->end_result = sctx->num_mrt_draw_calls;
-               break;
-       case SI_QUERY_PRIM_RESTART_CALLS:
-               query->end_result = sctx->num_prim_restart_calls;
-               break;
-       case SI_QUERY_SPILL_DRAW_CALLS:
-               query->end_result = sctx->num_spill_draw_calls;
-               break;
-       case SI_QUERY_COMPUTE_CALLS:
-               query->end_result = sctx->num_compute_calls;
-               break;
-       case SI_QUERY_SPILL_COMPUTE_CALLS:
-               query->end_result = sctx->num_spill_compute_calls;
-               break;
-       case SI_QUERY_DMA_CALLS:
-               query->end_result = sctx->num_dma_calls;
-               break;
-       case SI_QUERY_CP_DMA_CALLS:
-               query->end_result = sctx->num_cp_dma_calls;
-               break;
-       case SI_QUERY_NUM_VS_FLUSHES:
-               query->end_result = sctx->num_vs_flushes;
-               break;
-       case SI_QUERY_NUM_PS_FLUSHES:
-               query->end_result = sctx->num_ps_flushes;
-               break;
-       case SI_QUERY_NUM_CS_FLUSHES:
-               query->end_result = sctx->num_cs_flushes;
-               break;
-       case SI_QUERY_NUM_CB_CACHE_FLUSHES:
-               query->end_result = sctx->num_cb_cache_flushes;
-               break;
-       case SI_QUERY_NUM_DB_CACHE_FLUSHES:
-               query->end_result = sctx->num_db_cache_flushes;
-               break;
-       case SI_QUERY_NUM_L2_INVALIDATES:
-               query->end_result = sctx->num_L2_invalidates;
-               break;
-       case SI_QUERY_NUM_L2_WRITEBACKS:
-               query->end_result = sctx->num_L2_writebacks;
-               break;
-       case SI_QUERY_NUM_RESIDENT_HANDLES:
-               query->end_result = sctx->num_resident_handles;
-               break;
-       case SI_QUERY_TC_OFFLOADED_SLOTS:
-               query->end_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
-               break;
-       case SI_QUERY_TC_DIRECT_SLOTS:
-               query->end_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
-               break;
-       case SI_QUERY_TC_NUM_SYNCS:
-               query->end_result = sctx->tc ? sctx->tc->num_syncs : 0;
-               break;
-       case SI_QUERY_REQUESTED_VRAM:
-       case SI_QUERY_REQUESTED_GTT:
-       case SI_QUERY_MAPPED_VRAM:
-       case SI_QUERY_MAPPED_GTT:
-       case SI_QUERY_VRAM_USAGE:
-       case SI_QUERY_VRAM_VIS_USAGE:
-       case SI_QUERY_GTT_USAGE:
-       case SI_QUERY_GPU_TEMPERATURE:
-       case SI_QUERY_CURRENT_GPU_SCLK:
-       case SI_QUERY_CURRENT_GPU_MCLK:
-       case SI_QUERY_BUFFER_WAIT_TIME:
-       case SI_QUERY_GFX_IB_SIZE:
-       case SI_QUERY_NUM_MAPPED_BUFFERS:
-       case SI_QUERY_NUM_GFX_IBS:
-       case SI_QUERY_NUM_SDMA_IBS:
-       case SI_QUERY_NUM_BYTES_MOVED:
-       case SI_QUERY_NUM_EVICTIONS:
-       case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
-               enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
-               query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
-               break;
-       }
-       case SI_QUERY_GFX_BO_LIST_SIZE:
-               ws_id = winsys_id_from_type(query->b.type);
-               query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
-               query->end_time = sctx->ws->query_value(sctx->ws,
-                                                       RADEON_NUM_GFX_IBS);
-               break;
-       case SI_QUERY_CS_THREAD_BUSY:
-               ws_id = winsys_id_from_type(query->b.type);
-               query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
-               query->end_time = os_time_get_nano();
-               break;
-       case SI_QUERY_GALLIUM_THREAD_BUSY:
-               query->end_result =
-                       sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
-               query->end_time = os_time_get_nano();
-               break;
-       case SI_QUERY_GPU_LOAD:
-       case SI_QUERY_GPU_SHADERS_BUSY:
-       case SI_QUERY_GPU_TA_BUSY:
-       case SI_QUERY_GPU_GDS_BUSY:
-       case SI_QUERY_GPU_VGT_BUSY:
-       case SI_QUERY_GPU_IA_BUSY:
-       case SI_QUERY_GPU_SX_BUSY:
-       case SI_QUERY_GPU_WD_BUSY:
-       case SI_QUERY_GPU_BCI_BUSY:
-       case SI_QUERY_GPU_SC_BUSY:
-       case SI_QUERY_GPU_PA_BUSY:
-       case SI_QUERY_GPU_DB_BUSY:
-       case SI_QUERY_GPU_CP_BUSY:
-       case SI_QUERY_GPU_CB_BUSY:
-       case SI_QUERY_GPU_SDMA_BUSY:
-       case SI_QUERY_GPU_PFP_BUSY:
-       case SI_QUERY_GPU_MEQ_BUSY:
-       case SI_QUERY_GPU_ME_BUSY:
-       case SI_QUERY_GPU_SURF_SYNC_BUSY:
-       case SI_QUERY_GPU_CP_DMA_BUSY:
-       case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
-               query->end_result = si_end_counter(sctx->screen,
-                                                    query->b.type,
-                                                    query->begin_result);
-               query->begin_result = 0;
-               break;
-       case SI_QUERY_NUM_COMPILATIONS:
-               query->end_result = p_atomic_read(&sctx->screen->num_compilations);
-               break;
-       case SI_QUERY_NUM_SHADERS_CREATED:
-               query->end_result = p_atomic_read(&sctx->screen->num_shaders_created);
-               break;
-       case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
-               query->end_result = sctx->last_tex_ps_draw_ratio;
-               break;
-       case SI_QUERY_LIVE_SHADER_CACHE_HITS:
-               query->end_result = sctx->screen->live_shader_cache.hits;
-               break;
-       case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
-               query->end_result = sctx->screen->live_shader_cache.misses;
-               break;
-       case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
-               query->end_result = sctx->screen->num_memory_shader_cache_hits;
-               break;
-       case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
-               query->end_result = sctx->screen->num_memory_shader_cache_misses;
-               break;
-       case SI_QUERY_DISK_SHADER_CACHE_HITS:
-               query->end_result = sctx->screen->num_disk_shader_cache_hits;
-               break;
-       case SI_QUERY_DISK_SHADER_CACHE_MISSES:
-               query->end_result = sctx->screen->num_disk_shader_cache_misses;
-               break;
-       case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
-               query->end_result = sctx->compute_num_verts_accepted;
-               break;
-       case SI_QUERY_PD_NUM_PRIMS_REJECTED:
-               query->end_result = sctx->compute_num_verts_rejected;
-               break;
-       case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
-               query->end_result = sctx->compute_num_verts_ineligible;
-               break;
-       case SI_QUERY_GPIN_ASIC_ID:
-       case SI_QUERY_GPIN_NUM_SIMD:
-       case SI_QUERY_GPIN_NUM_RB:
-       case SI_QUERY_GPIN_NUM_SPI:
-       case SI_QUERY_GPIN_NUM_SE:
-               break;
-       default:
-               unreachable("si_query_sw_end: bad query type");
-       }
-
-       return true;
+   struct si_query_sw *query = (struct si_query_sw *)squery;
+   enum radeon_value_id ws_id;
+
+   switch (query->b.type) {
+   case PIPE_QUERY_TIMESTAMP_DISJOINT:
+      break;
+   case PIPE_QUERY_GPU_FINISHED:
+      sctx->b.flush(&sctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
+      break;
+   case SI_QUERY_TIME_ELAPSED_SDMA_SI:
+      query->end_result = si_finish_dma_get_cpu_time(sctx);
+      break;
+   case SI_QUERY_DRAW_CALLS:
+      query->end_result = sctx->num_draw_calls;
+      break;
+   case SI_QUERY_DECOMPRESS_CALLS:
+      query->end_result = sctx->num_decompress_calls;
+      break;
+   case SI_QUERY_MRT_DRAW_CALLS:
+      query->end_result = sctx->num_mrt_draw_calls;
+      break;
+   case SI_QUERY_PRIM_RESTART_CALLS:
+      query->end_result = sctx->num_prim_restart_calls;
+      break;
+   case SI_QUERY_SPILL_DRAW_CALLS:
+      query->end_result = sctx->num_spill_draw_calls;
+      break;
+   case SI_QUERY_COMPUTE_CALLS:
+      query->end_result = sctx->num_compute_calls;
+      break;
+   case SI_QUERY_SPILL_COMPUTE_CALLS:
+      query->end_result = sctx->num_spill_compute_calls;
+      break;
+   case SI_QUERY_DMA_CALLS:
+      query->end_result = sctx->num_dma_calls;
+      break;
+   case SI_QUERY_CP_DMA_CALLS:
+      query->end_result = sctx->num_cp_dma_calls;
+      break;
+   case SI_QUERY_NUM_VS_FLUSHES:
+      query->end_result = sctx->num_vs_flushes;
+      break;
+   case SI_QUERY_NUM_PS_FLUSHES:
+      query->end_result = sctx->num_ps_flushes;
+      break;
+   case SI_QUERY_NUM_CS_FLUSHES:
+      query->end_result = sctx->num_cs_flushes;
+      break;
+   case SI_QUERY_NUM_CB_CACHE_FLUSHES:
+      query->end_result = sctx->num_cb_cache_flushes;
+      break;
+   case SI_QUERY_NUM_DB_CACHE_FLUSHES:
+      query->end_result = sctx->num_db_cache_flushes;
+      break;
+   case SI_QUERY_NUM_L2_INVALIDATES:
+      query->end_result = sctx->num_L2_invalidates;
+      break;
+   case SI_QUERY_NUM_L2_WRITEBACKS:
+      query->end_result = sctx->num_L2_writebacks;
+      break;
+   case SI_QUERY_NUM_RESIDENT_HANDLES:
+      query->end_result = sctx->num_resident_handles;
+      break;
+   case SI_QUERY_TC_OFFLOADED_SLOTS:
+      query->end_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
+      break;
+   case SI_QUERY_TC_DIRECT_SLOTS:
+      query->end_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
+      break;
+   case SI_QUERY_TC_NUM_SYNCS:
+      query->end_result = sctx->tc ? sctx->tc->num_syncs : 0;
+      break;
+   case SI_QUERY_REQUESTED_VRAM:
+   case SI_QUERY_REQUESTED_GTT:
+   case SI_QUERY_MAPPED_VRAM:
+   case SI_QUERY_MAPPED_GTT:
+   case SI_QUERY_VRAM_USAGE:
+   case SI_QUERY_VRAM_VIS_USAGE:
+   case SI_QUERY_GTT_USAGE:
+   case SI_QUERY_GPU_TEMPERATURE:
+   case SI_QUERY_CURRENT_GPU_SCLK:
+   case SI_QUERY_CURRENT_GPU_MCLK:
+   case SI_QUERY_BUFFER_WAIT_TIME:
+   case SI_QUERY_GFX_IB_SIZE:
+   case SI_QUERY_NUM_MAPPED_BUFFERS:
+   case SI_QUERY_NUM_GFX_IBS:
+   case SI_QUERY_NUM_SDMA_IBS:
+   case SI_QUERY_NUM_BYTES_MOVED:
+   case SI_QUERY_NUM_EVICTIONS:
+   case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
+      enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
+      query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
+      break;
+   }
+   case SI_QUERY_GFX_BO_LIST_SIZE:
+      ws_id = winsys_id_from_type(query->b.type);
+      query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
+      query->end_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
+      break;
+   case SI_QUERY_CS_THREAD_BUSY:
+      ws_id = winsys_id_from_type(query->b.type);
+      query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
+      query->end_time = os_time_get_nano();
+      break;
+   case SI_QUERY_GALLIUM_THREAD_BUSY:
+      query->end_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
+      query->end_time = os_time_get_nano();
+      break;
+   case SI_QUERY_GPU_LOAD:
+   case SI_QUERY_GPU_SHADERS_BUSY:
+   case SI_QUERY_GPU_TA_BUSY:
+   case SI_QUERY_GPU_GDS_BUSY:
+   case SI_QUERY_GPU_VGT_BUSY:
+   case SI_QUERY_GPU_IA_BUSY:
+   case SI_QUERY_GPU_SX_BUSY:
+   case SI_QUERY_GPU_WD_BUSY:
+   case SI_QUERY_GPU_BCI_BUSY:
+   case SI_QUERY_GPU_SC_BUSY:
+   case SI_QUERY_GPU_PA_BUSY:
+   case SI_QUERY_GPU_DB_BUSY:
+   case SI_QUERY_GPU_CP_BUSY:
+   case SI_QUERY_GPU_CB_BUSY:
+   case SI_QUERY_GPU_SDMA_BUSY:
+   case SI_QUERY_GPU_PFP_BUSY:
+   case SI_QUERY_GPU_MEQ_BUSY:
+   case SI_QUERY_GPU_ME_BUSY:
+   case SI_QUERY_GPU_SURF_SYNC_BUSY:
+   case SI_QUERY_GPU_CP_DMA_BUSY:
+   case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
+      query->end_result = si_end_counter(sctx->screen, query->b.type, query->begin_result);
+      query->begin_result = 0;
+      break;
+   case SI_QUERY_NUM_COMPILATIONS:
+      query->end_result = p_atomic_read(&sctx->screen->num_compilations);
+      break;
+   case SI_QUERY_NUM_SHADERS_CREATED:
+      query->end_result = p_atomic_read(&sctx->screen->num_shaders_created);
+      break;
+   case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
+      query->end_result = sctx->last_tex_ps_draw_ratio;
+      break;
+   case SI_QUERY_LIVE_SHADER_CACHE_HITS:
+      query->end_result = sctx->screen->live_shader_cache.hits;
+      break;
+   case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
+      query->end_result = sctx->screen->live_shader_cache.misses;
+      break;
+   case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
+      query->end_result = sctx->screen->num_memory_shader_cache_hits;
+      break;
+   case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
+      query->end_result = sctx->screen->num_memory_shader_cache_misses;
+      break;
+   case SI_QUERY_DISK_SHADER_CACHE_HITS:
+      query->end_result = sctx->screen->num_disk_shader_cache_hits;
+      break;
+   case SI_QUERY_DISK_SHADER_CACHE_MISSES:
+      query->end_result = sctx->screen->num_disk_shader_cache_misses;
+      break;
+   case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
+      query->end_result = sctx->compute_num_verts_accepted;
+      break;
+   case SI_QUERY_PD_NUM_PRIMS_REJECTED:
+      query->end_result = sctx->compute_num_verts_rejected;
+      break;
+   case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
+      query->end_result = sctx->compute_num_verts_ineligible;
+      break;
+   case SI_QUERY_GPIN_ASIC_ID:
+   case SI_QUERY_GPIN_NUM_SIMD:
+   case SI_QUERY_GPIN_NUM_RB:
+   case SI_QUERY_GPIN_NUM_SPI:
+   case SI_QUERY_GPIN_NUM_SE:
+      break;
+   default:
+      unreachable("si_query_sw_end: bad query type");
+   }
+
+   return true;
 }
 
-static bool si_query_sw_get_result(struct si_context *sctx,
-                                  struct si_query *squery,
-                                  bool wait,
-                                  union pipe_query_result *result)
+static bool si_query_sw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
+                                   union pipe_query_result *result)
 {
-       struct si_query_sw *query = (struct si_query_sw *)squery;
-
-       switch (query->b.type) {
-       case PIPE_QUERY_TIMESTAMP_DISJOINT:
-               /* Convert from cycles per millisecond to cycles per second (Hz). */
-               result->timestamp_disjoint.frequency =
-                       (uint64_t)sctx->screen->info.clock_crystal_freq * 1000;
-               result->timestamp_disjoint.disjoint = false;
-               return true;
-       case PIPE_QUERY_GPU_FINISHED: {
-               struct pipe_screen *screen = sctx->b.screen;
-               struct pipe_context *ctx = squery->b.flushed ? NULL : &sctx->b;
-
-               result->b = screen->fence_finish(screen, ctx, query->fence,
-                                                wait ? PIPE_TIMEOUT_INFINITE : 0);
-               return result->b;
-       }
-
-       case SI_QUERY_GFX_BO_LIST_SIZE:
-               result->u64 = (query->end_result - query->begin_result) /
-                             (query->end_time - query->begin_time);
-               return true;
-       case SI_QUERY_CS_THREAD_BUSY:
-       case SI_QUERY_GALLIUM_THREAD_BUSY:
-               result->u64 = (query->end_result - query->begin_result) * 100 /
-                             (query->end_time - query->begin_time);
-               return true;
-       case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
-       case SI_QUERY_PD_NUM_PRIMS_REJECTED:
-       case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
-               result->u64 = ((unsigned)query->end_result -
-                              (unsigned)query->begin_result) / 3;
-               return true;
-       case SI_QUERY_GPIN_ASIC_ID:
-               result->u32 = 0;
-               return true;
-       case SI_QUERY_GPIN_NUM_SIMD:
-               result->u32 = sctx->screen->info.num_good_compute_units;
-               return true;
-       case SI_QUERY_GPIN_NUM_RB:
-               result->u32 = sctx->screen->info.num_render_backends;
-               return true;
-       case SI_QUERY_GPIN_NUM_SPI:
-               result->u32 = 1; /* all supported chips have one SPI per SE */
-               return true;
-       case SI_QUERY_GPIN_NUM_SE:
-               result->u32 = sctx->screen->info.max_se;
-               return true;
-       }
-
-       result->u64 = query->end_result - query->begin_result;
-
-       switch (query->b.type) {
-       case SI_QUERY_BUFFER_WAIT_TIME:
-       case SI_QUERY_GPU_TEMPERATURE:
-               result->u64 /= 1000;
-               break;
-       case SI_QUERY_CURRENT_GPU_SCLK:
-       case SI_QUERY_CURRENT_GPU_MCLK:
-               result->u64 *= 1000000;
-               break;
-       }
-
-       return true;
+   struct si_query_sw *query = (struct si_query_sw *)squery;
+
+   switch (query->b.type) {
+   case PIPE_QUERY_TIMESTAMP_DISJOINT:
+      /* Convert from cycles per millisecond to cycles per second (Hz). */
+      result->timestamp_disjoint.frequency = (uint64_t)sctx->screen->info.clock_crystal_freq * 1000;
+      result->timestamp_disjoint.disjoint = false;
+      return true;
+   case PIPE_QUERY_GPU_FINISHED: {
+      struct pipe_screen *screen = sctx->b.screen;
+      struct pipe_context *ctx = squery->b.flushed ? NULL : &sctx->b;
+
+      result->b = screen->fence_finish(screen, ctx, query->fence, wait ? PIPE_TIMEOUT_INFINITE : 0);
+      return result->b;
+   }
+
+   case SI_QUERY_GFX_BO_LIST_SIZE:
+      result->u64 =
+         (query->end_result - query->begin_result) / (query->end_time - query->begin_time);
+      return true;
+   case SI_QUERY_CS_THREAD_BUSY:
+   case SI_QUERY_GALLIUM_THREAD_BUSY:
+      result->u64 =
+         (query->end_result - query->begin_result) * 100 / (query->end_time - query->begin_time);
+      return true;
+   case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
+   case SI_QUERY_PD_NUM_PRIMS_REJECTED:
+   case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
+      result->u64 = ((unsigned)query->end_result - (unsigned)query->begin_result) / 3;
+      return true;
+   case SI_QUERY_GPIN_ASIC_ID:
+      result->u32 = 0;
+      return true;
+   case SI_QUERY_GPIN_NUM_SIMD:
+      result->u32 = sctx->screen->info.num_good_compute_units;
+      return true;
+   case SI_QUERY_GPIN_NUM_RB:
+      result->u32 = sctx->screen->info.num_render_backends;
+      return true;
+   case SI_QUERY_GPIN_NUM_SPI:
+      result->u32 = 1; /* all supported chips have one SPI per SE */
+      return true;
+   case SI_QUERY_GPIN_NUM_SE:
+      result->u32 = sctx->screen->info.max_se;
+      return true;
+   }
+
+   result->u64 = query->end_result - query->begin_result;
+
+   switch (query->b.type) {
+   case SI_QUERY_BUFFER_WAIT_TIME:
+   case SI_QUERY_GPU_TEMPERATURE:
+      result->u64 /= 1000;
+      break;
+   case SI_QUERY_CURRENT_GPU_SCLK:
+   case SI_QUERY_CURRENT_GPU_MCLK:
+      result->u64 *= 1000000;
+      break;
+   }
+
+   return true;
 }
 
-
-static const struct si_query_ops sw_query_ops = {
-       .destroy = si_query_sw_destroy,
-       .begin = si_query_sw_begin,
-       .end = si_query_sw_end,
-       .get_result = si_query_sw_get_result,
-       .get_result_resource = NULL
-};
+static const struct si_query_ops sw_query_ops = {.destroy = si_query_sw_destroy,
+                                                 .begin = si_query_sw_begin,
+                                                 .end = si_query_sw_end,
+                                                 .get_result = si_query_sw_get_result,
+                                                 .get_result_resource = NULL};
 
 static struct pipe_query *si_query_sw_create(unsigned query_type)
 {
-       struct si_query_sw *query;
+   struct si_query_sw *query;
 
-       query = CALLOC_STRUCT(si_query_sw);
-       if (!query)
-               return NULL;
+   query = CALLOC_STRUCT(si_query_sw);
+   if (!query)
+      return NULL;
 
-       query->b.type = query_type;
-       query->b.ops = &sw_query_ops;
+   query->b.type = query_type;
+   query->b.ops = &sw_query_ops;
 
-       return (struct pipe_query *)query;
+   return (struct pipe_query *)query;
 }
 
 void si_query_buffer_destroy(struct si_screen *sscreen, struct si_query_buffer *buffer)
 {
-       struct si_query_buffer *prev = buffer->previous;
+   struct si_query_buffer *prev = buffer->previous;
 
-       /* Release all query buffers. */
-       while (prev) {
-               struct si_query_buffer *qbuf = prev;
-               prev = prev->previous;
-               si_resource_reference(&qbuf->buf, NULL);
-               FREE(qbuf);
-       }
+   /* Release all query buffers. */
+   while (prev) {
+      struct si_query_buffer *qbuf = prev;
+      prev = prev->previous;
+      si_resource_reference(&qbuf->buf, NULL);
+      FREE(qbuf);
+   }
 
-       si_resource_reference(&buffer->buf, NULL);
+   si_resource_reference(&buffer->buf, NULL);
 }
 
 void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buffer)
 {
-       /* Discard all query buffers except for the oldest. */
-       while (buffer->previous) {
-               struct si_query_buffer *qbuf = buffer->previous;
-               buffer->previous = qbuf->previous;
-
-               si_resource_reference(&buffer->buf, NULL);
-               buffer->buf = qbuf->buf; /* move ownership */
-               FREE(qbuf);
-       }
-       buffer->results_end = 0;
-
-       if (!buffer->buf)
-               return;
-
-       /* Discard even the oldest buffer if it can't be mapped without a stall. */
-       if (si_rings_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) ||
-           !sctx->ws->buffer_wait(buffer->buf->buf, 0, RADEON_USAGE_READWRITE)) {
-               si_resource_reference(&buffer->buf, NULL);
-       } else {
-               buffer->unprepared = true;
-       }
+   /* Discard all query buffers except for the oldest. */
+   while (buffer->previous) {
+      struct si_query_buffer *qbuf = buffer->previous;
+      buffer->previous = qbuf->previous;
+
+      si_resource_reference(&buffer->buf, NULL);
+      buffer->buf = qbuf->buf; /* move ownership */
+      FREE(qbuf);
+   }
+   buffer->results_end = 0;
+
+   if (!buffer->buf)
+      return;
+
+   /* Discard even the oldest buffer if it can't be mapped without a stall. */
+   if (si_rings_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) ||
+       !sctx->ws->buffer_wait(buffer->buf->buf, 0, RADEON_USAGE_READWRITE)) {
+      si_resource_reference(&buffer->buf, NULL);
+   } else {
+      buffer->unprepared = true;
+   }
 }
 
 bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buffer,
-                          bool (*prepare_buffer)(struct si_context *, struct si_query_buffer*),
-                          unsigned size)
+                           bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *),
+                           unsigned size)
 {
-       bool unprepared = buffer->unprepared;
-       buffer->unprepared = false;
-
-       if (!buffer->buf || buffer->results_end + size > buffer->buf->b.b.width0) {
-               if (buffer->buf) {
-                       struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer);
-                       memcpy(qbuf, buffer, sizeof(*qbuf));
-                       buffer->previous = qbuf;
-               }
-               buffer->results_end = 0;
-
-               /* Queries are normally read by the CPU after
-                * being written by the gpu, hence staging is probably a good
-                * usage pattern.
-                */
-               struct si_screen *screen = sctx->screen;
-               unsigned buf_size = MAX2(size, screen->info.min_alloc_size);
-               buffer->buf = si_resource(
-                       pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
-               if (unlikely(!buffer->buf))
-                       return false;
-               unprepared = true;
-       }
-
-       if (unprepared && prepare_buffer) {
-               if (unlikely(!prepare_buffer(sctx, buffer))) {
-                       si_resource_reference(&buffer->buf, NULL);
-                       return false;
-               }
-       }
-
-       return true;
+   bool unprepared = buffer->unprepared;
+   buffer->unprepared = false;
+
+   if (!buffer->buf || buffer->results_end + size > buffer->buf->b.b.width0) {
+      if (buffer->buf) {
+         struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer);
+         memcpy(qbuf, buffer, sizeof(*qbuf));
+         buffer->previous = qbuf;
+      }
+      buffer->results_end = 0;
+
+      /* Queries are normally read by the CPU after
+       * being written by the gpu, hence staging is probably a good
+       * usage pattern.
+       */
+      struct si_screen *screen = sctx->screen;
+      unsigned buf_size = MAX2(size, screen->info.min_alloc_size);
+      buffer->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
+      if (unlikely(!buffer->buf))
+         return false;
+      unprepared = true;
+   }
+
+   if (unprepared && prepare_buffer) {
+      if (unlikely(!prepare_buffer(sctx, buffer))) {
+         si_resource_reference(&buffer->buf, NULL);
+         return false;
+      }
+   }
+
+   return true;
 }
 
-
 void si_query_hw_destroy(struct si_context *sctx, struct si_query *squery)
 {
-       struct si_query_hw *query = (struct si_query_hw *)squery;
+   struct si_query_hw *query = (struct si_query_hw *)squery;
 
-       si_query_buffer_destroy(sctx->screen, &query->buffer);
-       si_resource_reference(&query->workaround_buf, NULL);
-       FREE(squery);
+   si_query_buffer_destroy(sctx->screen, &query->buffer);
+   si_resource_reference(&query->workaround_buf, NULL);
+   FREE(squery);
 }
 
-static bool si_query_hw_prepare_buffer(struct si_context *sctx,
-                                      struct si_query_buffer *qbuf)
+static bool si_query_hw_prepare_buffer(struct si_context *sctx, struct si_query_buffer *qbuf)
 {
-       static const struct si_query_hw si_query_hw_s;
-       struct si_query_hw *query = container_of(qbuf, &si_query_hw_s, buffer);
-       struct si_screen *screen = sctx->screen;
-
-       /* The caller ensures that the buffer is currently unused by the GPU. */
-       uint32_t *results = screen->ws->buffer_map(qbuf->buf->buf, NULL,
-                                                  PIPE_TRANSFER_WRITE |
-                                                  PIPE_TRANSFER_UNSYNCHRONIZED);
-       if (!results)
-               return false;
-
-       memset(results, 0, qbuf->buf->b.b.width0);
-
-       if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
-           query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
-           query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
-               unsigned max_rbs = screen->info.num_render_backends;
-               unsigned enabled_rb_mask = screen->info.enabled_rb_mask;
-               unsigned num_results;
-               unsigned i, j;
-
-               /* Set top bits for unused backends. */
-               num_results = qbuf->buf->b.b.width0 / query->result_size;
-               for (j = 0; j < num_results; j++) {
-                       for (i = 0; i < max_rbs; i++) {
-                               if (!(enabled_rb_mask & (1<<i))) {
-                                       results[(i * 4)+1] = 0x80000000;
-                                       results[(i * 4)+3] = 0x80000000;
-                               }
-                       }
-                       results += 4 * max_rbs;
-               }
-       }
-
-       return true;
+   static const struct si_query_hw si_query_hw_s;
+   struct si_query_hw *query = container_of(qbuf, &si_query_hw_s, buffer);
+   struct si_screen *screen = sctx->screen;
+
+   /* The caller ensures that the buffer is currently unused by the GPU. */
+   uint32_t *results = screen->ws->buffer_map(qbuf->buf->buf, NULL,
+                                              PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED);
+   if (!results)
+      return false;
+
+   memset(results, 0, qbuf->buf->b.b.width0);
+
+   if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
+       query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
+       query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
+      unsigned max_rbs = screen->info.num_render_backends;
+      unsigned enabled_rb_mask = screen->info.enabled_rb_mask;
+      unsigned num_results;
+      unsigned i, j;
+
+      /* Set top bits for unused backends. */
+      num_results = qbuf->buf->b.b.width0 / query->result_size;
+      for (j = 0; j < num_results; j++) {
+         for (i = 0; i < max_rbs; i++) {
+            if (!(enabled_rb_mask & (1 << i))) {
+               results[(i * 4) + 1] = 0x80000000;
+               results[(i * 4) + 3] = 0x80000000;
+            }
+         }
+         results += 4 * max_rbs;
+      }
+   }
+
+   return true;
 }
 
-static void si_query_hw_get_result_resource(struct si_context *sctx,
-                                           struct si_query *squery,
-                                           bool wait,
-                                           enum pipe_query_value_type result_type,
-                                           int index,
-                                           struct pipe_resource *resource,
-                                           unsigned offset);
-
-static void si_query_hw_do_emit_start(struct si_context *sctx,
-                                     struct si_query_hw *query,
-                                     struct si_resource *buffer,
-                                     uint64_t va);
-static void si_query_hw_do_emit_stop(struct si_context *sctx,
-                                    struct si_query_hw *query,
-                                    struct si_resource *buffer,
-                                    uint64_t va);
-static void si_query_hw_add_result(struct si_screen *sscreen,
-                                  struct si_query_hw *, void *buffer,
-                                  union pipe_query_result *result);
-static void si_query_hw_clear_result(struct si_query_hw *,
-                                    union pipe_query_result *);
+static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery,
+                                            bool wait, enum pipe_query_value_type result_type,
+                                            int index, struct pipe_resource *resource,
+                                            unsigned offset);
+
+static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query,
+                                      struct si_resource *buffer, uint64_t va);
+static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query,
+                                     struct si_resource *buffer, uint64_t va);
+static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *, void *buffer,
+                                   union pipe_query_result *result);
+static void si_query_hw_clear_result(struct si_query_hw *, union pipe_query_result *);
 
 static struct si_query_hw_ops query_hw_default_hw_ops = {
-       .prepare_buffer = si_query_hw_prepare_buffer,
-       .emit_start = si_query_hw_do_emit_start,
-       .emit_stop = si_query_hw_do_emit_stop,
-       .clear_result = si_query_hw_clear_result,
-       .add_result = si_query_hw_add_result,
+   .prepare_buffer = si_query_hw_prepare_buffer,
+   .emit_start = si_query_hw_do_emit_start,
+   .emit_stop = si_query_hw_do_emit_stop,
+   .clear_result = si_query_hw_clear_result,
+   .add_result = si_query_hw_add_result,
 };
 
-static struct pipe_query *si_query_hw_create(struct si_screen *sscreen,
-                                            unsigned query_type,
-                                            unsigned index)
+static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, unsigned query_type,
+                                             unsigned index)
 {
-       struct si_query_hw *query = CALLOC_STRUCT(si_query_hw);
-       if (!query)
-               return NULL;
-
-       query->b.type = query_type;
-       query->b.ops = &query_hw_ops;
-       query->ops = &query_hw_default_hw_ops;
-
-       switch (query_type) {
-       case PIPE_QUERY_OCCLUSION_COUNTER:
-       case PIPE_QUERY_OCCLUSION_PREDICATE:
-       case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
-               query->result_size = 16 * sscreen->info.num_render_backends;
-               query->result_size += 16; /* for the fence + alignment */
-               query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
-               break;
-       case SI_QUERY_TIME_ELAPSED_SDMA:
-               /* GET_GLOBAL_TIMESTAMP only works if the offset is a multiple of 32. */
-               query->result_size = 64;
-               break;
-       case PIPE_QUERY_TIME_ELAPSED:
-               query->result_size = 24;
-               query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
-               break;
-       case PIPE_QUERY_TIMESTAMP:
-               query->result_size = 16;
-               query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
-               query->flags = SI_QUERY_HW_FLAG_NO_START;
-               break;
-       case PIPE_QUERY_PRIMITIVES_EMITTED:
-       case PIPE_QUERY_PRIMITIVES_GENERATED:
-       case PIPE_QUERY_SO_STATISTICS:
-       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-               /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
-               query->result_size = 32;
-               query->b.num_cs_dw_suspend = 6;
-               query->stream = index;
-               break;
-       case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
-               /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
-               query->result_size = 32 * SI_MAX_STREAMS;
-               query->b.num_cs_dw_suspend = 6 * SI_MAX_STREAMS;
-               break;
-       case PIPE_QUERY_PIPELINE_STATISTICS:
-               /* 11 values on GCN. */
-               query->result_size = 11 * 16;
-               query->result_size += 8; /* for the fence + alignment */
-               query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
-               break;
-       default:
-               assert(0);
-               FREE(query);
-               return NULL;
-       }
-
-       return (struct pipe_query *)query;
+   struct si_query_hw *query = CALLOC_STRUCT(si_query_hw);
+   if (!query)
+      return NULL;
+
+   query->b.type = query_type;
+   query->b.ops = &query_hw_ops;
+   query->ops = &query_hw_default_hw_ops;
+
+   switch (query_type) {
+   case PIPE_QUERY_OCCLUSION_COUNTER:
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
+   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+      query->result_size = 16 * sscreen->info.num_render_backends;
+      query->result_size += 16; /* for the fence + alignment */
+      query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
+      break;
+   case SI_QUERY_TIME_ELAPSED_SDMA:
+      /* GET_GLOBAL_TIMESTAMP only works if the offset is a multiple of 32. */
+      query->result_size = 64;
+      break;
+   case PIPE_QUERY_TIME_ELAPSED:
+      query->result_size = 24;
+      query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
+      break;
+   case PIPE_QUERY_TIMESTAMP:
+      query->result_size = 16;
+      query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
+      query->flags = SI_QUERY_HW_FLAG_NO_START;
+      break;
+   case PIPE_QUERY_PRIMITIVES_EMITTED:
+   case PIPE_QUERY_PRIMITIVES_GENERATED:
+   case PIPE_QUERY_SO_STATISTICS:
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+      /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
+      query->result_size = 32;
+      query->b.num_cs_dw_suspend = 6;
+      query->stream = index;
+      break;
+   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+      /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
+      query->result_size = 32 * SI_MAX_STREAMS;
+      query->b.num_cs_dw_suspend = 6 * SI_MAX_STREAMS;
+      break;
+   case PIPE_QUERY_PIPELINE_STATISTICS:
+      /* 11 values on GCN. */
+      query->result_size = 11 * 16;
+      query->result_size += 8; /* for the fence + alignment */
+      query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
+      break;
+   default:
+      assert(0);
+      FREE(query);
+      return NULL;
+   }
+
+   return (struct pipe_query *)query;
 }
 
-static void si_update_occlusion_query_state(struct si_context *sctx,
-                                           unsigned type, int diff)
+static void si_update_occlusion_query_state(struct si_context *sctx, unsigned type, int diff)
 {
-       if (type == PIPE_QUERY_OCCLUSION_COUNTER ||
-           type == PIPE_QUERY_OCCLUSION_PREDICATE ||
-           type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
-               bool old_enable = sctx->num_occlusion_queries != 0;
-               bool old_perfect_enable =
-                       sctx->num_perfect_occlusion_queries != 0;
-               bool enable, perfect_enable;
-
-               sctx->num_occlusion_queries += diff;
-               assert(sctx->num_occlusion_queries >= 0);
-
-               if (type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
-                       sctx->num_perfect_occlusion_queries += diff;
-                       assert(sctx->num_perfect_occlusion_queries >= 0);
-               }
-
-               enable = sctx->num_occlusion_queries != 0;
-               perfect_enable = sctx->num_perfect_occlusion_queries != 0;
-
-               if (enable != old_enable || perfect_enable != old_perfect_enable) {
-                       si_set_occlusion_query_state(sctx, old_perfect_enable);
-               }
-       }
+   if (type == PIPE_QUERY_OCCLUSION_COUNTER || type == PIPE_QUERY_OCCLUSION_PREDICATE ||
+       type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
+      bool old_enable = sctx->num_occlusion_queries != 0;
+      bool old_perfect_enable = sctx->num_perfect_occlusion_queries != 0;
+      bool enable, perfect_enable;
+
+      sctx->num_occlusion_queries += diff;
+      assert(sctx->num_occlusion_queries >= 0);
+
+      if (type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
+         sctx->num_perfect_occlusion_queries += diff;
+         assert(sctx->num_perfect_occlusion_queries >= 0);
+      }
+
+      enable = sctx->num_occlusion_queries != 0;
+      perfect_enable = sctx->num_perfect_occlusion_queries != 0;
+
+      if (enable != old_enable || perfect_enable != old_perfect_enable) {
+         si_set_occlusion_query_state(sctx, old_perfect_enable);
+      }
+   }
 }
 
 static unsigned event_type_for_stream(unsigned stream)
 {
-       switch (stream) {
-       default:
-       case 0: return V_028A90_SAMPLE_STREAMOUTSTATS;
-       case 1: return V_028A90_SAMPLE_STREAMOUTSTATS1;
-       case 2: return V_028A90_SAMPLE_STREAMOUTSTATS2;
-       case 3: return V_028A90_SAMPLE_STREAMOUTSTATS3;
-       }
+   switch (stream) {
+   default:
+   case 0:
+      return V_028A90_SAMPLE_STREAMOUTSTATS;
+   case 1:
+      return V_028A90_SAMPLE_STREAMOUTSTATS1;
+   case 2:
+      return V_028A90_SAMPLE_STREAMOUTSTATS2;
+   case 3:
+      return V_028A90_SAMPLE_STREAMOUTSTATS3;
+   }
 }
 
-static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va,
-                                 unsigned stream)
+static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va, unsigned stream)
 {
-       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
-       radeon_emit(cs, EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3));
-       radeon_emit(cs, va);
-       radeon_emit(cs, va >> 32);
+   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+   radeon_emit(cs, EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3));
+   radeon_emit(cs, va);
+   radeon_emit(cs, va >> 32);
 }
 
-static void si_query_hw_do_emit_start(struct si_context *sctx,
-                                       struct si_query_hw *query,
-                                       struct si_resource *buffer,
-                                       uint64_t va)
+static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query,
+                                      struct si_resource *buffer, uint64_t va)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-
-       switch (query->b.type) {
-       case SI_QUERY_TIME_ELAPSED_SDMA:
-               si_dma_emit_timestamp(sctx, buffer, va - buffer->gpu_address);
-               return;
-       case PIPE_QUERY_OCCLUSION_COUNTER:
-       case PIPE_QUERY_OCCLUSION_PREDICATE:
-       case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
-               radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
-               radeon_emit(cs, va);
-               radeon_emit(cs, va >> 32);
-               break;
-       case PIPE_QUERY_PRIMITIVES_EMITTED:
-       case PIPE_QUERY_PRIMITIVES_GENERATED:
-       case PIPE_QUERY_SO_STATISTICS:
-       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-               emit_sample_streamout(cs, va, query->stream);
-               break;
-       case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
-               for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
-                       emit_sample_streamout(cs, va + 32 * stream, stream);
-               break;
-       case PIPE_QUERY_TIME_ELAPSED:
-               si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0,
-                                 EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
-                                 EOP_DATA_SEL_TIMESTAMP, NULL, va,
-                                 0, query->b.type);
-               break;
-       case PIPE_QUERY_PIPELINE_STATISTICS:
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
-               radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
-               radeon_emit(cs, va);
-               radeon_emit(cs, va >> 32);
-               break;
-       default:
-               assert(0);
-       }
-       radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE,
-                                 RADEON_PRIO_QUERY);
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+   switch (query->b.type) {
+   case SI_QUERY_TIME_ELAPSED_SDMA:
+      si_dma_emit_timestamp(sctx, buffer, va - buffer->gpu_address);
+      return;
+   case PIPE_QUERY_OCCLUSION_COUNTER:
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
+   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+      radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
+      radeon_emit(cs, va);
+      radeon_emit(cs, va >> 32);
+      break;
+   case PIPE_QUERY_PRIMITIVES_EMITTED:
+   case PIPE_QUERY_PRIMITIVES_GENERATED:
+   case PIPE_QUERY_SO_STATISTICS:
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+      emit_sample_streamout(cs, va, query->stream);
+      break;
+   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
+         emit_sample_streamout(cs, va + 32 * stream, stream);
+      break;
+   case PIPE_QUERY_TIME_ELAPSED:
+      si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
+                        EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
+      break;
+   case PIPE_QUERY_PIPELINE_STATISTICS:
+      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+      radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
+      radeon_emit(cs, va);
+      radeon_emit(cs, va >> 32);
+      break;
+   default:
+      assert(0);
+   }
+   radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE,
+                             RADEON_PRIO_QUERY);
 }
 
-static void si_query_hw_emit_start(struct si_context *sctx,
-                                  struct si_query_hw *query)
+static void si_query_hw_emit_start(struct si_context *sctx, struct si_query_hw *query)
 {
-       uint64_t va;
+   uint64_t va;
 
-       if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer,
-                                  query->result_size))
-               return;
+   if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer, query->result_size))
+      return;
 
-       si_update_occlusion_query_state(sctx, query->b.type, 1);
-       si_update_prims_generated_query_state(sctx, query->b.type, 1);
+   si_update_occlusion_query_state(sctx, query->b.type, 1);
+   si_update_prims_generated_query_state(sctx, query->b.type, 1);
 
-       if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
-               sctx->num_pipeline_stat_queries++;
+   if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
+      sctx->num_pipeline_stat_queries++;
 
-       if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA)
-               si_need_gfx_cs_space(sctx);
+   if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA)
+      si_need_gfx_cs_space(sctx);
 
-       va = query->buffer.buf->gpu_address + query->buffer.results_end;
-       query->ops->emit_start(sctx, query, query->buffer.buf, va);
+   va = query->buffer.buf->gpu_address + query->buffer.results_end;
+   query->ops->emit_start(sctx, query, query->buffer.buf, va);
 }
 
-static void si_query_hw_do_emit_stop(struct si_context *sctx,
-                                      struct si_query_hw *query,
-                                      struct si_resource *buffer,
-                                      uint64_t va)
+static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query,
+                                     struct si_resource *buffer, uint64_t va)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       uint64_t fence_va = 0;
-
-       switch (query->b.type) {
-       case SI_QUERY_TIME_ELAPSED_SDMA:
-               si_dma_emit_timestamp(sctx, buffer, va + 32 - buffer->gpu_address);
-               return;
-       case PIPE_QUERY_OCCLUSION_COUNTER:
-       case PIPE_QUERY_OCCLUSION_PREDICATE:
-       case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
-               va += 8;
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
-               radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
-               radeon_emit(cs, va);
-               radeon_emit(cs, va >> 32);
-
-               fence_va = va + sctx->screen->info.num_render_backends * 16 - 8;
-               break;
-       case PIPE_QUERY_PRIMITIVES_EMITTED:
-       case PIPE_QUERY_PRIMITIVES_GENERATED:
-       case PIPE_QUERY_SO_STATISTICS:
-       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-               va += 16;
-               emit_sample_streamout(cs, va, query->stream);
-               break;
-       case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
-               va += 16;
-               for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
-                       emit_sample_streamout(cs, va + 32 * stream, stream);
-               break;
-       case PIPE_QUERY_TIME_ELAPSED:
-               va += 8;
-               /* fall through */
-       case PIPE_QUERY_TIMESTAMP:
-               si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0,
-                                 EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
-                                 EOP_DATA_SEL_TIMESTAMP, NULL, va,
-                                 0, query->b.type);
-               fence_va = va + 8;
-               break;
-       case PIPE_QUERY_PIPELINE_STATISTICS: {
-               unsigned sample_size = (query->result_size - 8) / 2;
-
-               va += sample_size;
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
-               radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
-               radeon_emit(cs, va);
-               radeon_emit(cs, va >> 32);
-
-               fence_va = va + sample_size;
-               break;
-       }
-       default:
-               assert(0);
-       }
-       radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE,
-                                 RADEON_PRIO_QUERY);
-
-       if (fence_va) {
-               si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0,
-                                 EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
-                                 EOP_DATA_SEL_VALUE_32BIT,
-                                 query->buffer.buf, fence_va, 0x80000000,
-                                 query->b.type);
-       }
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   uint64_t fence_va = 0;
+
+   switch (query->b.type) {
+   case SI_QUERY_TIME_ELAPSED_SDMA:
+      si_dma_emit_timestamp(sctx, buffer, va + 32 - buffer->gpu_address);
+      return;
+   case PIPE_QUERY_OCCLUSION_COUNTER:
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
+   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+      va += 8;
+      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+      radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
+      radeon_emit(cs, va);
+      radeon_emit(cs, va >> 32);
+
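+      /* The fence dword follows the last RB's begin/end pair (16 bytes per RB). */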
+      fence_va = va + sctx->screen->info.num_render_backends * 16 - 8;
+      break;
+   case PIPE_QUERY_PRIMITIVES_EMITTED:
+   case PIPE_QUERY_PRIMITIVES_GENERATED:
+   case PIPE_QUERY_SO_STATISTICS:
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+      va += 16;
+      emit_sample_streamout(cs, va, query->stream);
+      break;
+   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+      va += 16;
+      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
+         emit_sample_streamout(cs, va + 32 * stream, stream);
+      break;
+   case PIPE_QUERY_TIME_ELAPSED:
+      va += 8;
+      /* fall through */
+   case PIPE_QUERY_TIMESTAMP:
+      si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
+                        EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
+      fence_va = va + 8;
+      break;
+   case PIPE_QUERY_PIPELINE_STATISTICS: {
+      unsigned sample_size = (query->result_size - 8) / 2;
+
+      va += sample_size;
+      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+      radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
+      radeon_emit(cs, va);
+      radeon_emit(cs, va >> 32);
+
+      fence_va = va + sample_size;
+      break;
+   }
+   default:
+      assert(0);
+   }
+   radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE,
+                             RADEON_PRIO_QUERY);
+
+   if (fence_va) {
+      si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
+                        EOP_DATA_SEL_VALUE_32BIT, query->buffer.buf, fence_va, 0x80000000,
+                        query->b.type);
+   }
 }
 
-static void si_query_hw_emit_stop(struct si_context *sctx,
-                                 struct si_query_hw *query)
+static void si_query_hw_emit_stop(struct si_context *sctx, struct si_query_hw *query)
 {
-       uint64_t va;
+   uint64_t va;
 
-       /* The queries which need begin already called this in begin_query. */
-       if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
-               si_need_gfx_cs_space(sctx);
-               if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer,
-                                          query->result_size))
-                       return;
-       }
+   /* The queries which need begin already called this in begin_query. */
+   if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
+      si_need_gfx_cs_space(sctx);
+      if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer,
+                                 query->result_size))
+         return;
+   }
 
-       if (!query->buffer.buf)
-               return; // previous buffer allocation failure
+   if (!query->buffer.buf)
+      return; // previous buffer allocation failure
 
-       /* emit end query */
-       va = query->buffer.buf->gpu_address + query->buffer.results_end;
+   /* emit end query */
+   va = query->buffer.buf->gpu_address + query->buffer.results_end;
 
-       query->ops->emit_stop(sctx, query, query->buffer.buf, va);
+   query->ops->emit_stop(sctx, query, query->buffer.buf, va);
 
-       query->buffer.results_end += query->result_size;
+   query->buffer.results_end += query->result_size;
 
-       si_update_occlusion_query_state(sctx, query->b.type, -1);
-       si_update_prims_generated_query_state(sctx, query->b.type, -1);
+   si_update_occlusion_query_state(sctx, query->b.type, -1);
+   si_update_prims_generated_query_state(sctx, query->b.type, -1);
 
-       if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
-               sctx->num_pipeline_stat_queries--;
+   if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
+      sctx->num_pipeline_stat_queries--;
 }
 
-static void emit_set_predicate(struct si_context *ctx,
-                              struct si_resource *buf, uint64_t va,
-                              uint32_t op)
+static void emit_set_predicate(struct si_context *ctx, struct si_resource *buf, uint64_t va,
+                               uint32_t op)
 {
-       struct radeon_cmdbuf *cs = ctx->gfx_cs;
-
-       if (ctx->chip_class >= GFX9) {
-               radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0));
-               radeon_emit(cs, op);
-               radeon_emit(cs, va);
-               radeon_emit(cs, va >> 32);
-       } else {
-               radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
-               radeon_emit(cs, va);
-               radeon_emit(cs, op | ((va >> 32) & 0xFF));
-       }
-       radeon_add_to_buffer_list(ctx, ctx->gfx_cs, buf, RADEON_USAGE_READ,
-                                 RADEON_PRIO_QUERY);
+   struct radeon_cmdbuf *cs = ctx->gfx_cs;
+
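+   /* GFX9+ carries the predication op in its own dword; older chips OR it with the upper VA bits. */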
+   if (ctx->chip_class >= GFX9) {
+      radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0));
+      radeon_emit(cs, op);
+      radeon_emit(cs, va);
+      radeon_emit(cs, va >> 32);
+   } else {
+      radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
+      radeon_emit(cs, va);
+      radeon_emit(cs, op | ((va >> 32) & 0xFF));
+   }
+   radeon_add_to_buffer_list(ctx, ctx->gfx_cs, buf, RADEON_USAGE_READ, RADEON_PRIO_QUERY);
 }
 
 static void si_emit_query_predication(struct si_context *ctx)
 {
-       struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond;
-       struct si_query_buffer *qbuf;
-       uint32_t op;
-       bool flag_wait, invert;
-
-       if (!query)
-               return;
-
-       if (ctx->screen->use_ngg_streamout &&
-           (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
-            query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) {
-               assert(!"not implemented");
-       }
-
-       invert = ctx->render_cond_invert;
-       flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
-                   ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
-
-       if (query->workaround_buf) {
-               op = PRED_OP(PREDICATION_OP_BOOL64);
-       } else {
-               switch (query->b.type) {
-               case PIPE_QUERY_OCCLUSION_COUNTER:
-               case PIPE_QUERY_OCCLUSION_PREDICATE:
-               case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
-                       op = PRED_OP(PREDICATION_OP_ZPASS);
-                       break;
-               case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-               case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
-                       op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
-                       invert = !invert;
-                       break;
-               default:
-                       assert(0);
-                       return;
-               }
-       }
-
-       /* if true then invert, see GL_ARB_conditional_render_inverted */
-       if (invert)
-               op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
-       else
-               op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
-
-       /* Use the value written by compute shader as a workaround. Note that
-        * the wait flag does not apply in this predication mode.
-        *
-        * The shader outputs the result value to L2. Workarounds only affect GFX8
-        * and later, where the CP reads data from L2, so we don't need an
-        * additional flush.
-        */
-       if (query->workaround_buf) {
-               uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset;
-               emit_set_predicate(ctx, query->workaround_buf, va, op);
-               return;
-       }
-
-       op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
-
-       /* emit predicate packets for all data blocks */
-       for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
-               unsigned results_base = 0;
-               uint64_t va_base = qbuf->buf->gpu_address;
-
-               while (results_base < qbuf->results_end) {
-                       uint64_t va = va_base + results_base;
-
-                       if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
-                               for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
-                                       emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);
-
-                                       /* set CONTINUE bit for all packets except the first */
-                                       op |= PREDICATION_CONTINUE;
-                               }
-                       } else {
-                               emit_set_predicate(ctx, qbuf->buf, va, op);
-                               op |= PREDICATION_CONTINUE;
-                       }
-
-                       results_base += query->result_size;
-               }
-       }
+   struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond;
+   struct si_query_buffer *qbuf;
+   uint32_t op;
+   bool flag_wait, invert;
+
+   if (!query)
+      return;
+
+   if (ctx->screen->use_ngg_streamout && (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+                                          query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) {
+      assert(!"not implemented");
+   }
+
+   invert = ctx->render_cond_invert;
+   flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
+               ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
+
+   if (query->workaround_buf) {
+      op = PRED_OP(PREDICATION_OP_BOOL64);
+   } else {
+      switch (query->b.type) {
+      case PIPE_QUERY_OCCLUSION_COUNTER:
+      case PIPE_QUERY_OCCLUSION_PREDICATE:
+      case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+         op = PRED_OP(PREDICATION_OP_ZPASS);
+         break;
+      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+         op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
+         invert = !invert;
+         break;
+      default:
+         assert(0);
+         return;
+      }
+   }
+
+   /* if true then invert, see GL_ARB_conditional_render_inverted */
+   if (invert)
+      op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
+   else
+      op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
+
+   /* Use the value written by compute shader as a workaround. Note that
+    * the wait flag does not apply in this predication mode.
+    *
+    * The shader outputs the result value to L2. Workarounds only affect GFX8
+    * and later, where the CP reads data from L2, so we don't need an
+    * additional flush.
+    */
+   if (query->workaround_buf) {
+      uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset;
+      emit_set_predicate(ctx, query->workaround_buf, va, op);
+      return;
+   }
+
+   op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
+
+   /* emit predicate packets for all data blocks */
+   for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
+      unsigned results_base = 0;
+      uint64_t va_base = qbuf->buf->gpu_address;
+
+      while (results_base < qbuf->results_end) {
+         uint64_t va = va_base + results_base;
+
+         if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
+            for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
+               emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);
+
+               /* set CONTINUE bit for all packets except the first */
+               op |= PREDICATION_CONTINUE;
+            }
+         } else {
+            emit_set_predicate(ctx, qbuf->buf, va, op);
+            op |= PREDICATION_CONTINUE;
+         }
+
+         results_base += query->result_size;
+      }
+   }
 }
 
-static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
+static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned query_type,
+                                          unsigned index)
 {
-       struct si_screen *sscreen =
-               (struct si_screen *)ctx->screen;
-
-       if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT ||
-           query_type == PIPE_QUERY_GPU_FINISHED ||
-           (query_type >= PIPE_QUERY_DRIVER_SPECIFIC &&
-            query_type != SI_QUERY_TIME_ELAPSED_SDMA))
-               return si_query_sw_create(query_type);
-
-       if (sscreen->use_ngg_streamout &&
-           (query_type == PIPE_QUERY_PRIMITIVES_EMITTED ||
-            query_type == PIPE_QUERY_PRIMITIVES_GENERATED ||
-            query_type == PIPE_QUERY_SO_STATISTICS ||
-            query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
-            query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE))
-               return gfx10_sh_query_create(sscreen, query_type, index);
-
-       return si_query_hw_create(sscreen, query_type, index);
+   struct si_screen *sscreen = (struct si_screen *)ctx->screen;
+
+   if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || query_type == PIPE_QUERY_GPU_FINISHED ||
+       (query_type >= PIPE_QUERY_DRIVER_SPECIFIC && query_type != SI_QUERY_TIME_ELAPSED_SDMA))
+      return si_query_sw_create(query_type);
+
+   if (sscreen->use_ngg_streamout &&
+       (query_type == PIPE_QUERY_PRIMITIVES_EMITTED ||
+        query_type == PIPE_QUERY_PRIMITIVES_GENERATED || query_type == PIPE_QUERY_SO_STATISTICS ||
+        query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+        query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE))
+      return gfx10_sh_query_create(sscreen, query_type, index);
+
+   return si_query_hw_create(sscreen, query_type, index);
 }
 
 static void si_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_query *squery = (struct si_query *)query;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_query *squery = (struct si_query *)query;
 
-       squery->ops->destroy(sctx, squery);
+   squery->ops->destroy(sctx, squery);
 }
 
-static bool si_begin_query(struct pipe_context *ctx,
-                          struct pipe_query *query)
+static bool si_begin_query(struct pipe_context *ctx, struct pipe_query *query)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_query *squery = (struct si_query *)query;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_query *squery = (struct si_query *)query;
 
-       return squery->ops->begin(sctx, squery);
+   return squery->ops->begin(sctx, squery);
 }
 
-bool si_query_hw_begin(struct si_context *sctx,
-                      struct si_query *squery)
+bool si_query_hw_begin(struct si_context *sctx, struct si_query *squery)
 {
-       struct si_query_hw *query = (struct si_query_hw *)squery;
+   struct si_query_hw *query = (struct si_query_hw *)squery;
 
-       if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
-               assert(0);
-               return false;
-       }
+   if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
+      assert(0);
+      return false;
+   }
 
-       if (!(query->flags & SI_QUERY_HW_FLAG_BEGIN_RESUMES))
-               si_query_buffer_reset(sctx, &query->buffer);
+   if (!(query->flags & SI_QUERY_HW_FLAG_BEGIN_RESUMES))
+      si_query_buffer_reset(sctx, &query->buffer);
 
-       si_resource_reference(&query->workaround_buf, NULL);
+   si_resource_reference(&query->workaround_buf, NULL);
 
-       si_query_hw_emit_start(sctx, query);
-       if (!query->buffer.buf)
-               return false;
+   si_query_hw_emit_start(sctx, query);
+   if (!query->buffer.buf)
+      return false;
 
-       list_addtail(&query->b.active_list, &sctx->active_queries);
-       sctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
-       return true;
+   list_addtail(&query->b.active_list, &sctx->active_queries);
+   sctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
+   return true;
 }
 
 static bool si_end_query(struct pipe_context *ctx, struct pipe_query *query)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_query *squery = (struct si_query *)query;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_query *squery = (struct si_query *)query;
 
-       return squery->ops->end(sctx, squery);
+   return squery->ops->end(sctx, squery);
 }
 
-bool si_query_hw_end(struct si_context *sctx,
-                    struct si_query *squery)
+bool si_query_hw_end(struct si_context *sctx, struct si_query *squery)
 {
-       struct si_query_hw *query = (struct si_query_hw *)squery;
+   struct si_query_hw *query = (struct si_query_hw *)squery;
 
-       if (query->flags & SI_QUERY_HW_FLAG_NO_START)
-               si_query_buffer_reset(sctx, &query->buffer);
+   if (query->flags & SI_QUERY_HW_FLAG_NO_START)
+      si_query_buffer_reset(sctx, &query->buffer);
 
-       si_query_hw_emit_stop(sctx, query);
+   si_query_hw_emit_stop(sctx, query);
 
-       if (!(query->flags & SI_QUERY_HW_FLAG_NO_START)) {
-               list_delinit(&query->b.active_list);
-               sctx->num_cs_dw_queries_suspend -= query->b.num_cs_dw_suspend;
-       }
+   if (!(query->flags & SI_QUERY_HW_FLAG_NO_START)) {
+      list_delinit(&query->b.active_list);
+      sctx->num_cs_dw_queries_suspend -= query->b.num_cs_dw_suspend;
+   }
 
-       if (!query->buffer.buf)
-               return false;
+   if (!query->buffer.buf)
+      return false;
 
-       return true;
+   return true;
 }
 
-static void si_get_hw_query_params(struct si_context *sctx,
-                                  struct si_query_hw *squery, int index,
-                                  struct si_hw_query_params *params)
+static void si_get_hw_query_params(struct si_context *sctx, struct si_query_hw *squery, int index,
+                                   struct si_hw_query_params *params)
 {
-       unsigned max_rbs = sctx->screen->info.num_render_backends;
-
-       params->pair_stride = 0;
-       params->pair_count = 1;
-
-       switch (squery->b.type) {
-       case PIPE_QUERY_OCCLUSION_COUNTER:
-       case PIPE_QUERY_OCCLUSION_PREDICATE:
-       case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
-               params->start_offset = 0;
-               params->end_offset = 8;
-               params->fence_offset = max_rbs * 16;
-               params->pair_stride = 16;
-               params->pair_count = max_rbs;
-               break;
-       case PIPE_QUERY_TIME_ELAPSED:
-               params->start_offset = 0;
-               params->end_offset = 8;
-               params->fence_offset = 16;
-               break;
-       case PIPE_QUERY_TIMESTAMP:
-               params->start_offset = 0;
-               params->end_offset = 0;
-               params->fence_offset = 8;
-               break;
-       case PIPE_QUERY_PRIMITIVES_EMITTED:
-               params->start_offset = 8;
-               params->end_offset = 24;
-               params->fence_offset = params->end_offset + 4;
-               break;
-       case PIPE_QUERY_PRIMITIVES_GENERATED:
-               params->start_offset = 0;
-               params->end_offset = 16;
-               params->fence_offset = params->end_offset + 4;
-               break;
-       case PIPE_QUERY_SO_STATISTICS:
-               params->start_offset = 8 - index * 8;
-               params->end_offset = 24 - index * 8;
-               params->fence_offset = params->end_offset + 4;
-               break;
-       case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
-               params->pair_count = SI_MAX_STREAMS;
-               params->pair_stride = 32;
-       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-               params->start_offset = 0;
-               params->end_offset = 16;
-
-               /* We can re-use the high dword of the last 64-bit value as a
-                * fence: it is initialized as 0, and the high bit is set by
-                * the write of the streamout stats event.
-                */
-               params->fence_offset = squery->result_size - 4;
-               break;
-       case PIPE_QUERY_PIPELINE_STATISTICS:
-       {
-               static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80};
-               params->start_offset = offsets[index];
-               params->end_offset = 88 + offsets[index];
-               params->fence_offset = 2 * 88;
-               break;
-       }
-       default:
-               unreachable("si_get_hw_query_params unsupported");
-       }
+   unsigned max_rbs = sctx->screen->info.num_render_backends;
+
+   params->pair_stride = 0;
+   params->pair_count = 1;
+
+   switch (squery->b.type) {
+   case PIPE_QUERY_OCCLUSION_COUNTER:
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
+   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+      params->start_offset = 0;
+      params->end_offset = 8;
+      params->fence_offset = max_rbs * 16;
+      params->pair_stride = 16;
+      params->pair_count = max_rbs;
+      break;
+   case PIPE_QUERY_TIME_ELAPSED:
+      params->start_offset = 0;
+      params->end_offset = 8;
+      params->fence_offset = 16;
+      break;
+   case PIPE_QUERY_TIMESTAMP:
+      params->start_offset = 0;
+      params->end_offset = 0;
+      params->fence_offset = 8;
+      break;
+   case PIPE_QUERY_PRIMITIVES_EMITTED:
+      params->start_offset = 8;
+      params->end_offset = 24;
+      params->fence_offset = params->end_offset + 4;
+      break;
+   case PIPE_QUERY_PRIMITIVES_GENERATED:
+      params->start_offset = 0;
+      params->end_offset = 16;
+      params->fence_offset = params->end_offset + 4;
+      break;
+   case PIPE_QUERY_SO_STATISTICS:
+      params->start_offset = 8 - index * 8;
+      params->end_offset = 24 - index * 8;
+      params->fence_offset = params->end_offset + 4;
+      break;
+   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+      params->pair_count = SI_MAX_STREAMS;
+      params->pair_stride = 32;
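+      /* fall through */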
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+      params->start_offset = 0;
+      params->end_offset = 16;
+
+      /* We can re-use the high dword of the last 64-bit value as a
+       * fence: it is initialized as 0, and the high bit is set by
+       * the write of the streamout stats event.
+       */
+      params->fence_offset = squery->result_size - 4;
+      break;
+   case PIPE_QUERY_PIPELINE_STATISTICS: {
+      static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80};
+      params->start_offset = offsets[index];
+      params->end_offset = 88 + offsets[index];
+      params->fence_offset = 2 * 88;
+      break;
+   }
+   default:
+      unreachable("si_get_hw_query_params unsupported");
+   }
 }
 
 static unsigned si_query_read_result(void *map, unsigned start_index, unsigned end_index,
-                                    bool test_status_bit)
+                                     bool test_status_bit)
 {
-       uint32_t *current_result = (uint32_t*)map;
-       uint64_t start, end;
-
-       start = (uint64_t)current_result[start_index] |
-               (uint64_t)current_result[start_index+1] << 32;
-       end = (uint64_t)current_result[end_index] |
-             (uint64_t)current_result[end_index+1] << 32;
-
-       if (!test_status_bit ||
-           ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
-               return end - start;
-       }
-       return 0;
+   uint32_t *current_result = (uint32_t *)map;
+   uint64_t start, end;
+
+   start = (uint64_t)current_result[start_index] | (uint64_t)current_result[start_index + 1] << 32;
+   end = (uint64_t)current_result[end_index] | (uint64_t)current_result[end_index + 1] << 32;
+
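+   /* Bit 63 of each value is set once it has been written; only count pairs where both are set. */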
+   if (!test_status_bit || ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
+      return end - start;
+   }
+   return 0;
 }
 
-static void si_query_hw_add_result(struct si_screen *sscreen,
-                                    struct si_query_hw *query,
-                                    void *buffer,
-                                    union pipe_query_result *result)
+static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *query,
+                                   void *buffer, union pipe_query_result *result)
 {
-       unsigned max_rbs = sscreen->info.num_render_backends;
-
-       switch (query->b.type) {
-       case PIPE_QUERY_OCCLUSION_COUNTER: {
-               for (unsigned i = 0; i < max_rbs; ++i) {
-                       unsigned results_base = i * 16;
-                       result->u64 +=
-                               si_query_read_result(buffer + results_base, 0, 2, true);
-               }
-               break;
-       }
-       case PIPE_QUERY_OCCLUSION_PREDICATE:
-       case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
-               for (unsigned i = 0; i < max_rbs; ++i) {
-                       unsigned results_base = i * 16;
-                       result->b = result->b ||
-                               si_query_read_result(buffer + results_base, 0, 2, true) != 0;
-               }
-               break;
-       }
-       case PIPE_QUERY_TIME_ELAPSED:
-               result->u64 += si_query_read_result(buffer, 0, 2, false);
-               break;
-       case SI_QUERY_TIME_ELAPSED_SDMA:
-               result->u64 += si_query_read_result(buffer, 0, 32/4, false);
-               break;
-       case PIPE_QUERY_TIMESTAMP:
-               result->u64 = *(uint64_t*)buffer;
-               break;
-       case PIPE_QUERY_PRIMITIVES_EMITTED:
-               /* SAMPLE_STREAMOUTSTATS stores this structure:
-                * {
-                *    u64 NumPrimitivesWritten;
-                *    u64 PrimitiveStorageNeeded;
-                * }
-                * We only need NumPrimitivesWritten here. */
-               result->u64 += si_query_read_result(buffer, 2, 6, true);
-               break;
-       case PIPE_QUERY_PRIMITIVES_GENERATED:
-               /* Here we read PrimitiveStorageNeeded. */
-               result->u64 += si_query_read_result(buffer, 0, 4, true);
-               break;
-       case PIPE_QUERY_SO_STATISTICS:
-               result->so_statistics.num_primitives_written +=
-                       si_query_read_result(buffer, 2, 6, true);
-               result->so_statistics.primitives_storage_needed +=
-                       si_query_read_result(buffer, 0, 4, true);
-               break;
-       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-               result->b = result->b ||
-                       si_query_read_result(buffer, 2, 6, true) !=
-                       si_query_read_result(buffer, 0, 4, true);
-               break;
-       case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
-               for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
-                       result->b = result->b ||
-                               si_query_read_result(buffer, 2, 6, true) !=
-                               si_query_read_result(buffer, 0, 4, true);
-                       buffer = (char *)buffer + 32;
-               }
-               break;
-       case PIPE_QUERY_PIPELINE_STATISTICS:
-               result->pipeline_statistics.ps_invocations +=
-                       si_query_read_result(buffer, 0, 22, false);
-               result->pipeline_statistics.c_primitives +=
-                       si_query_read_result(buffer, 2, 24, false);
-               result->pipeline_statistics.c_invocations +=
-                       si_query_read_result(buffer, 4, 26, false);
-               result->pipeline_statistics.vs_invocations +=
-                       si_query_read_result(buffer, 6, 28, false);
-               result->pipeline_statistics.gs_invocations +=
-                       si_query_read_result(buffer, 8, 30, false);
-               result->pipeline_statistics.gs_primitives +=
-                       si_query_read_result(buffer, 10, 32, false);
-               result->pipeline_statistics.ia_primitives +=
-                       si_query_read_result(buffer, 12, 34, false);
-               result->pipeline_statistics.ia_vertices +=
-                       si_query_read_result(buffer, 14, 36, false);
-               result->pipeline_statistics.hs_invocations +=
-                       si_query_read_result(buffer, 16, 38, false);
-               result->pipeline_statistics.ds_invocations +=
-                       si_query_read_result(buffer, 18, 40, false);
-               result->pipeline_statistics.cs_invocations +=
-                       si_query_read_result(buffer, 20, 42, false);
+   unsigned max_rbs = sscreen->info.num_render_backends;
+
+   switch (query->b.type) {
+   case PIPE_QUERY_OCCLUSION_COUNTER: {
+      for (unsigned i = 0; i < max_rbs; ++i) {
+         unsigned results_base = i * 16;
+         result->u64 += si_query_read_result(buffer + results_base, 0, 2, true);
+      }
+      break;
+   }
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
+   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
+      for (unsigned i = 0; i < max_rbs; ++i) {
+         unsigned results_base = i * 16;
+         result->b = result->b || si_query_read_result(buffer + results_base, 0, 2, true) != 0;
+      }
+      break;
+   }
+   case PIPE_QUERY_TIME_ELAPSED:
+      result->u64 += si_query_read_result(buffer, 0, 2, false);
+      break;
+   case SI_QUERY_TIME_ELAPSED_SDMA:
+      result->u64 += si_query_read_result(buffer, 0, 32 / 4, false);
+      break;
+   case PIPE_QUERY_TIMESTAMP:
+      result->u64 = *(uint64_t *)buffer;
+      break;
+   case PIPE_QUERY_PRIMITIVES_EMITTED:
+      /* SAMPLE_STREAMOUTSTATS stores this structure:
+       * {
+       *    u64 NumPrimitivesWritten;
+       *    u64 PrimitiveStorageNeeded;
+       * }
+       * We only need NumPrimitivesWritten here. */
+      result->u64 += si_query_read_result(buffer, 2, 6, true);
+      break;
+   case PIPE_QUERY_PRIMITIVES_GENERATED:
+      /* Here we read PrimitiveStorageNeeded. */
+      result->u64 += si_query_read_result(buffer, 0, 4, true);
+      break;
+   case PIPE_QUERY_SO_STATISTICS:
+      result->so_statistics.num_primitives_written += si_query_read_result(buffer, 2, 6, true);
+      result->so_statistics.primitives_storage_needed += si_query_read_result(buffer, 0, 4, true);
+      break;
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+      result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
+                                  si_query_read_result(buffer, 0, 4, true);
+      break;
+   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
+         result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
+                                     si_query_read_result(buffer, 0, 4, true);
+         buffer = (char *)buffer + 32;
+      }
+      break;
+   case PIPE_QUERY_PIPELINE_STATISTICS:
+      result->pipeline_statistics.ps_invocations += si_query_read_result(buffer, 0, 22, false);
+      result->pipeline_statistics.c_primitives += si_query_read_result(buffer, 2, 24, false);
+      result->pipeline_statistics.c_invocations += si_query_read_result(buffer, 4, 26, false);
+      result->pipeline_statistics.vs_invocations += si_query_read_result(buffer, 6, 28, false);
+      result->pipeline_statistics.gs_invocations += si_query_read_result(buffer, 8, 30, false);
+      result->pipeline_statistics.gs_primitives += si_query_read_result(buffer, 10, 32, false);
+      result->pipeline_statistics.ia_primitives += si_query_read_result(buffer, 12, 34, false);
+      result->pipeline_statistics.ia_vertices += si_query_read_result(buffer, 14, 36, false);
+      result->pipeline_statistics.hs_invocations += si_query_read_result(buffer, 16, 38, false);
+      result->pipeline_statistics.ds_invocations += si_query_read_result(buffer, 18, 40, false);
+      result->pipeline_statistics.cs_invocations += si_query_read_result(buffer, 20, 42, false);
 #if 0 /* for testing */
                printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
                       "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
@@ -1402,444 +1346,416 @@ static void si_query_hw_add_result(struct si_screen *sscreen,
                       result->pipeline_statistics.ps_invocations,
                       result->pipeline_statistics.cs_invocations);
 #endif
-               break;
-       default:
-               assert(0);
-       }
+      break;
+   default:
+      assert(0);
+   }
 }
 
 void si_query_hw_suspend(struct si_context *sctx, struct si_query *query)
 {
-       si_query_hw_emit_stop(sctx, (struct si_query_hw *)query);
+   si_query_hw_emit_stop(sctx, (struct si_query_hw *)query);
 }
 
 void si_query_hw_resume(struct si_context *sctx, struct si_query *query)
 {
-       si_query_hw_emit_start(sctx, (struct si_query_hw *)query);
+   si_query_hw_emit_start(sctx, (struct si_query_hw *)query);
 }
 
 static const struct si_query_ops query_hw_ops = {
-       .destroy = si_query_hw_destroy,
-       .begin = si_query_hw_begin,
-       .end = si_query_hw_end,
-       .get_result = si_query_hw_get_result,
-       .get_result_resource = si_query_hw_get_result_resource,
-
-       .suspend = si_query_hw_suspend,
-       .resume = si_query_hw_resume,
+   .destroy = si_query_hw_destroy,
+   .begin = si_query_hw_begin,
+   .end = si_query_hw_end,
+   .get_result = si_query_hw_get_result,
+   .get_result_resource = si_query_hw_get_result_resource,
+
+   .suspend = si_query_hw_suspend,
+   .resume = si_query_hw_resume,
 };
 
-static bool si_get_query_result(struct pipe_context *ctx,
-                               struct pipe_query *query, bool wait,
-                               union pipe_query_result *result)
+static bool si_get_query_result(struct pipe_context *ctx, struct pipe_query *query, bool wait,
+                                union pipe_query_result *result)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_query *squery = (struct si_query *)query;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_query *squery = (struct si_query *)query;
 
-       return squery->ops->get_result(sctx, squery, wait, result);
+   return squery->ops->get_result(sctx, squery, wait, result);
 }
 
-static void si_get_query_result_resource(struct pipe_context *ctx,
-                                        struct pipe_query *query,
-                                        bool wait,
-                                        enum pipe_query_value_type result_type,
-                                        int index,
-                                        struct pipe_resource *resource,
-                                        unsigned offset)
+static void si_get_query_result_resource(struct pipe_context *ctx, struct pipe_query *query,
+                                         bool wait, enum pipe_query_value_type result_type,
+                                         int index, struct pipe_resource *resource, unsigned offset)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_query *squery = (struct si_query *)query;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_query *squery = (struct si_query *)query;
 
-       squery->ops->get_result_resource(sctx, squery, wait, result_type, index,
-                                        resource, offset);
+   squery->ops->get_result_resource(sctx, squery, wait, result_type, index, resource, offset);
 }
 
-static void si_query_hw_clear_result(struct si_query_hw *query,
-                                      union pipe_query_result *result)
+static void si_query_hw_clear_result(struct si_query_hw *query, union pipe_query_result *result)
 {
-       util_query_clear_result(result, query->b.type);
+   util_query_clear_result(result, query->b.type);
 }
 
-bool si_query_hw_get_result(struct si_context *sctx,
-                           struct si_query *squery,
-                           bool wait, union pipe_query_result *result)
+bool si_query_hw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
+                            union pipe_query_result *result)
 {
-       struct si_screen *sscreen = sctx->screen;
-       struct si_query_hw *query = (struct si_query_hw *)squery;
-       struct si_query_buffer *qbuf;
-
-       query->ops->clear_result(query, result);
-
-       for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
-               unsigned usage = PIPE_TRANSFER_READ |
-                                (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
-               unsigned results_base = 0;
-               void *map;
-
-               if (squery->b.flushed)
-                       map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
-               else
-                       map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
-
-               if (!map)
-                       return false;
-
-               while (results_base != qbuf->results_end) {
-                       query->ops->add_result(sscreen, query, map + results_base,
-                                              result);
-                       results_base += query->result_size;
-               }
-       }
-
-       /* Convert the time to expected units. */
-       if (squery->type == PIPE_QUERY_TIME_ELAPSED ||
-           squery->type == SI_QUERY_TIME_ELAPSED_SDMA ||
-           squery->type == PIPE_QUERY_TIMESTAMP) {
-               result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq;
-       }
-       return true;
+   struct si_screen *sscreen = sctx->screen;
+   struct si_query_hw *query = (struct si_query_hw *)squery;
+   struct si_query_buffer *qbuf;
+
+   query->ops->clear_result(query, result);
+
+   for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
+      unsigned usage = PIPE_TRANSFER_READ | (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
+      unsigned results_base = 0;
+      void *map;
+
+      if (squery->b.flushed)
+         map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
+      else
+         map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
+
+      if (!map)
+         return false;
+
+      while (results_base != qbuf->results_end) {
+         query->ops->add_result(sscreen, query, map + results_base, result);
+         results_base += query->result_size;
+      }
+   }
+
+   /* Convert the time to expected units. */
+   if (squery->type == PIPE_QUERY_TIME_ELAPSED || squery->type == SI_QUERY_TIME_ELAPSED_SDMA ||
+       squery->type == PIPE_QUERY_TIMESTAMP) {
+      result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq;
+   }
+   return true;
 }
 
-static void si_query_hw_get_result_resource(struct si_context *sctx,
-                                              struct si_query *squery,
-                                              bool wait,
-                                              enum pipe_query_value_type result_type,
-                                              int index,
-                                              struct pipe_resource *resource,
-                                              unsigned offset)
+static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery,
+                                            bool wait, enum pipe_query_value_type result_type,
+                                            int index, struct pipe_resource *resource,
+                                            unsigned offset)
 {
-       struct si_query_hw *query = (struct si_query_hw *)squery;
-       struct si_query_buffer *qbuf;
-       struct si_query_buffer *qbuf_prev;
-       struct pipe_resource *tmp_buffer = NULL;
-       unsigned tmp_buffer_offset = 0;
-       struct si_qbo_state saved_state = {};
-       struct pipe_grid_info grid = {};
-       struct pipe_constant_buffer constant_buffer = {};
-       struct pipe_shader_buffer ssbo[3];
-       struct si_hw_query_params params;
-       struct {
-               uint32_t end_offset;
-               uint32_t result_stride;
-               uint32_t result_count;
-               uint32_t config;
-               uint32_t fence_offset;
-               uint32_t pair_stride;
-               uint32_t pair_count;
-       } consts;
-
-       if (!sctx->query_result_shader) {
-               sctx->query_result_shader = si_create_query_result_cs(sctx);
-               if (!sctx->query_result_shader)
-                       return;
-       }
-
-       if (query->buffer.previous) {
-               u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16,
-                                    &tmp_buffer_offset, &tmp_buffer);
-               if (!tmp_buffer)
-                       return;
-       }
-
-       si_save_qbo_state(sctx, &saved_state);
-
-       si_get_hw_query_params(sctx, query, index >= 0 ? index : 0, &params);
-       consts.end_offset = params.end_offset - params.start_offset;
-       consts.fence_offset = params.fence_offset - params.start_offset;
-       consts.result_stride = query->result_size;
-       consts.pair_stride = params.pair_stride;
-       consts.pair_count = params.pair_count;
-
-       constant_buffer.buffer_size = sizeof(consts);
-       constant_buffer.user_buffer = &consts;
-
-       ssbo[1].buffer = tmp_buffer;
-       ssbo[1].buffer_offset = tmp_buffer_offset;
-       ssbo[1].buffer_size = 16;
-
-       ssbo[2] = ssbo[1];
-
-       sctx->b.bind_compute_state(&sctx->b, sctx->query_result_shader);
-
-       grid.block[0] = 1;
-       grid.block[1] = 1;
-       grid.block[2] = 1;
-       grid.grid[0] = 1;
-       grid.grid[1] = 1;
-       grid.grid[2] = 1;
-
-       consts.config = 0;
-       if (index < 0)
-               consts.config |= 4;
-       if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
-           query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
-               consts.config |= 8;
-       else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
-                query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
-               consts.config |= 8 | 256;
-       else if (query->b.type == PIPE_QUERY_TIMESTAMP ||
-                query->b.type == PIPE_QUERY_TIME_ELAPSED)
-               consts.config |= 32;
-
-       switch (result_type) {
-       case PIPE_QUERY_TYPE_U64:
-       case PIPE_QUERY_TYPE_I64:
-               consts.config |= 64;
-               break;
-       case PIPE_QUERY_TYPE_I32:
-               consts.config |= 128;
-               break;
-       case PIPE_QUERY_TYPE_U32:
-               break;
-       }
-
-       sctx->flags |= sctx->screen->barrier_flags.cp_to_L2;
-
-       for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
-               if (query->b.type != PIPE_QUERY_TIMESTAMP) {
-                       qbuf_prev = qbuf->previous;
-                       consts.result_count = qbuf->results_end / query->result_size;
-                       consts.config &= ~3;
-                       if (qbuf != &query->buffer)
-                               consts.config |= 1;
-                       if (qbuf->previous)
-                               consts.config |= 2;
-               } else {
-                       /* Only read the last timestamp. */
-                       qbuf_prev = NULL;
-                       consts.result_count = 0;
-                       consts.config |= 16;
-                       params.start_offset += qbuf->results_end - query->result_size;
-               }
-
-               sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
-
-               ssbo[0].buffer = &qbuf->buf->b.b;
-               ssbo[0].buffer_offset = params.start_offset;
-               ssbo[0].buffer_size = qbuf->results_end - params.start_offset;
-
-               if (!qbuf->previous) {
-                       ssbo[2].buffer = resource;
-                       ssbo[2].buffer_offset = offset;
-                       ssbo[2].buffer_size = 8;
-
-                       si_resource(resource)->TC_L2_dirty = true;
-               }
-
-               sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo,
-                                          1 << 2);
-
-               if (wait && qbuf == &query->buffer) {
-                       uint64_t va;
-
-                       /* Wait for result availability. Wait only for readiness
-                        * of the last entry, since the fence writes should be
-                        * serialized in the CP.
-                        */
-                       va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
-                       va += params.fence_offset;
-
-                       si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x80000000,
-                                      0x80000000, WAIT_REG_MEM_EQUAL);
-               }
-
-               sctx->b.launch_grid(&sctx->b, &grid);
-               sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
-       }
-
-       si_restore_qbo_state(sctx, &saved_state);
-       pipe_resource_reference(&tmp_buffer, NULL);
+   struct si_query_hw *query = (struct si_query_hw *)squery;
+   struct si_query_buffer *qbuf;
+   struct si_query_buffer *qbuf_prev;
+   struct pipe_resource *tmp_buffer = NULL;
+   unsigned tmp_buffer_offset = 0;
+   struct si_qbo_state saved_state = {};
+   struct pipe_grid_info grid = {};
+   struct pipe_constant_buffer constant_buffer = {};
+   struct pipe_shader_buffer ssbo[3];
+   struct si_hw_query_params params;
+   struct {
+      uint32_t end_offset;
+      uint32_t result_stride;
+      uint32_t result_count;
+      uint32_t config;
+      uint32_t fence_offset;
+      uint32_t pair_stride;
+      uint32_t pair_count;
+   } consts;
+
+   if (!sctx->query_result_shader) {
+      sctx->query_result_shader = si_create_query_result_cs(sctx);
+      if (!sctx->query_result_shader)
+         return;
+   }
+
+   if (query->buffer.previous) {
+      u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
+      if (!tmp_buffer)
+         return;
+   }
+
+   si_save_qbo_state(sctx, &saved_state);
+
+   si_get_hw_query_params(sctx, query, index >= 0 ? index : 0, &params);
+   consts.end_offset = params.end_offset - params.start_offset;
+   consts.fence_offset = params.fence_offset - params.start_offset;
+   consts.result_stride = query->result_size;
+   consts.pair_stride = params.pair_stride;
+   consts.pair_count = params.pair_count;
+
+   constant_buffer.buffer_size = sizeof(consts);
+   constant_buffer.user_buffer = &consts;
+
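+   /* ssbo[1] is a scratch buffer; ssbo[2] is redirected to the destination on the final pass below. */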
+   ssbo[1].buffer = tmp_buffer;
+   ssbo[1].buffer_offset = tmp_buffer_offset;
+   ssbo[1].buffer_size = 16;
+
+   ssbo[2] = ssbo[1];
+
+   sctx->b.bind_compute_state(&sctx->b, sctx->query_result_shader);
+
+   grid.block[0] = 1;
+   grid.block[1] = 1;
+   grid.block[2] = 1;
+   grid.grid[0] = 1;
+   grid.grid[1] = 1;
+   grid.grid[2] = 1;
+
+   consts.config = 0;
+   if (index < 0)
+      consts.config |= 4;
+   if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
+       query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
+      consts.config |= 8;
+   else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+            query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
+      consts.config |= 8 | 256;
+   else if (query->b.type == PIPE_QUERY_TIMESTAMP || query->b.type == PIPE_QUERY_TIME_ELAPSED)
+      consts.config |= 32;
+
+   switch (result_type) {
+   case PIPE_QUERY_TYPE_U64:
+   case PIPE_QUERY_TYPE_I64:
+      consts.config |= 64;
+      break;
+   case PIPE_QUERY_TYPE_I32:
+      consts.config |= 128;
+      break;
+   case PIPE_QUERY_TYPE_U32:
+      break;
+   }
+
+   sctx->flags |= sctx->screen->barrier_flags.cp_to_L2;
+
+   for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
+      if (query->b.type != PIPE_QUERY_TIMESTAMP) {
+         qbuf_prev = qbuf->previous;
+         consts.result_count = qbuf->results_end / query->result_size;
+         consts.config &= ~3;
+         if (qbuf != &query->buffer)
+            consts.config |= 1;
+         if (qbuf->previous)
+            consts.config |= 2;
+      } else {
+         /* Only read the last timestamp. */
+         qbuf_prev = NULL;
+         consts.result_count = 0;
+         consts.config |= 16;
+         params.start_offset += qbuf->results_end - query->result_size;
+      }
+
+      sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
+
+      ssbo[0].buffer = &qbuf->buf->b.b;
+      ssbo[0].buffer_offset = params.start_offset;
+      ssbo[0].buffer_size = qbuf->results_end - params.start_offset;
+
+      if (!qbuf->previous) {
+         ssbo[2].buffer = resource;
+         ssbo[2].buffer_offset = offset;
+         ssbo[2].buffer_size = 8;
+
+         si_resource(resource)->TC_L2_dirty = true;
+      }
+
+      sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 1 << 2);
+
+      if (wait && qbuf == &query->buffer) {
+         uint64_t va;
+
+         /* Wait for result availability. Wait only for readiness
+          * of the last entry, since the fence writes should be
+          * serialized in the CP.
+          */
+         va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
+         va += params.fence_offset;
+
+         si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x80000000, 0x80000000, WAIT_REG_MEM_EQUAL);
+      }
+
+      sctx->b.launch_grid(&sctx->b, &grid);
+      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+   }
+
+   si_restore_qbo_state(sctx, &saved_state);
+   pipe_resource_reference(&tmp_buffer, NULL);
 }
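
The compute path above backs pipe_context::get_query_result_resource, which stores a query result into a caller-supplied buffer on the GPU instead of reading it back on the CPU. The sketch below is only an illustration of how a caller might invoke it, not code from this patch; ctx, query and dst are assumed to already exist, and the destination range must be large enough for the requested result type (the shader writes 8 bytes for a 64-bit result, per ssbo[2].buffer_size above). Passing wait = true adds the WAIT_REG_MEM fence wait shown above; a negative index takes the consts.config |= 4 path.

   /* Hedged sketch (not part of this change): write one 64-bit result
    * at byte offset 0 of an assumed, pre-created buffer 'dst'. */
   ctx->get_query_result_resource(ctx, query,
                                  false,               /* wait */
                                  PIPE_QUERY_TYPE_U64, /* result_type */
                                  0,                   /* index */
                                  dst,                 /* struct pipe_resource * */
                                  0);                  /* byte offset into dst */
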
 
-static void si_render_condition(struct pipe_context *ctx,
-                               struct pipe_query *query,
-                               bool condition,
-                               enum pipe_render_cond_flag mode)
+static void si_render_condition(struct pipe_context *ctx, struct pipe_query *query, bool condition,
+                                enum pipe_render_cond_flag mode)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_query_hw *squery = (struct si_query_hw *)query;
-       struct si_atom *atom = &sctx->atoms.s.render_cond;
-
-       if (query) {
-               bool needs_workaround = false;
-
-               /* There was a firmware regression in GFX8 which causes successive
-                * SET_PREDICATION packets to give the wrong answer for
-                * non-inverted stream overflow predication.
-                */
-               if (((sctx->chip_class == GFX8 && sctx->screen->info.pfp_fw_feature < 49) ||
-                    (sctx->chip_class == GFX9 && sctx->screen->info.pfp_fw_feature < 38)) &&
-                   !condition &&
-                   (squery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE ||
-                    (squery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE &&
-                     (squery->buffer.previous ||
-                      squery->buffer.results_end > squery->result_size)))) {
-                       needs_workaround = true;
-               }
-
-               if (needs_workaround && !squery->workaround_buf) {
-                       bool old_force_off = sctx->render_cond_force_off;
-                       sctx->render_cond_force_off = true;
-
-                       u_suballocator_alloc(
-                               sctx->allocator_zeroed_memory, 8, 8,
-                               &squery->workaround_offset,
-                               (struct pipe_resource **)&squery->workaround_buf);
-
-                       /* Reset to NULL to avoid a redundant SET_PREDICATION
-                        * from launching the compute grid.
-                        */
-                       sctx->render_cond = NULL;
-
-                       ctx->get_query_result_resource(
-                               ctx, query, true, PIPE_QUERY_TYPE_U64, 0,
-                               &squery->workaround_buf->b.b, squery->workaround_offset);
-
-                       /* Settings this in the render cond atom is too late,
-                        * so set it here. */
-                       sctx->flags |= sctx->screen->barrier_flags.L2_to_cp |
-                                      SI_CONTEXT_FLUSH_FOR_RENDER_COND;
-
-                       sctx->render_cond_force_off = old_force_off;
-               }
-       }
-
-       sctx->render_cond = query;
-       sctx->render_cond_invert = condition;
-       sctx->render_cond_mode = mode;
-
-       si_set_atom_dirty(sctx, atom, query != NULL);
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_query_hw *squery = (struct si_query_hw *)query;
+   struct si_atom *atom = &sctx->atoms.s.render_cond;
+
+   if (query) {
+      bool needs_workaround = false;
+
+      /* There was a firmware regression in GFX8 which causes successive
+       * SET_PREDICATION packets to give the wrong answer for
+       * non-inverted stream overflow predication.
+       */
+      if (((sctx->chip_class == GFX8 && sctx->screen->info.pfp_fw_feature < 49) ||
+           (sctx->chip_class == GFX9 && sctx->screen->info.pfp_fw_feature < 38)) &&
+          !condition &&
+          (squery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE ||
+           (squery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE &&
+            (squery->buffer.previous || squery->buffer.results_end > squery->result_size)))) {
+         needs_workaround = true;
+      }
+
+      if (needs_workaround && !squery->workaround_buf) {
+         bool old_force_off = sctx->render_cond_force_off;
+         sctx->render_cond_force_off = true;
+
+         u_suballocator_alloc(sctx->allocator_zeroed_memory, 8, 8, &squery->workaround_offset,
+                              (struct pipe_resource **)&squery->workaround_buf);
+
+         /* Reset to NULL to avoid a redundant SET_PREDICATION
+          * from launching the compute grid.
+          */
+         sctx->render_cond = NULL;
+
+         ctx->get_query_result_resource(ctx, query, true, PIPE_QUERY_TYPE_U64, 0,
+                                        &squery->workaround_buf->b.b, squery->workaround_offset);
+
+         /* Setting this in the render cond atom is too late,
+          * so set it here. */
+         sctx->flags |= sctx->screen->barrier_flags.L2_to_cp | SI_CONTEXT_FLUSH_FOR_RENDER_COND;
+
+         sctx->render_cond_force_off = old_force_off;
+      }
+   }
+
+   sctx->render_cond = query;
+   sctx->render_cond_invert = condition;
+   sctx->render_cond_mode = mode;
+
+   si_set_atom_dirty(sctx, atom, query != NULL);
 }
 
 void si_suspend_queries(struct si_context *sctx)
 {
-       struct si_query *query;
+   struct si_query *query;
 
-       LIST_FOR_EACH_ENTRY(query, &sctx->active_queries, active_list)
-               query->ops->suspend(sctx, query);
+   LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
+      query->ops->suspend(sctx, query);
 }
 
 void si_resume_queries(struct si_context *sctx)
 {
-       struct si_query *query;
+   struct si_query *query;
 
-       /* Check CS space here. Resuming must not be interrupted by flushes. */
-       si_need_gfx_cs_space(sctx);
+   /* Check CS space here. Resuming must not be interrupted by flushes. */
+   si_need_gfx_cs_space(sctx);
 
-       LIST_FOR_EACH_ENTRY(query, &sctx->active_queries, active_list)
-               query->ops->resume(sctx, query);
+   LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
+      query->ops->resume(sctx, query);
 }
 
-#define XFULL(name_, query_type_, type_, result_type_, group_id_) \
-       { \
-               .name = name_, \
-               .query_type = SI_QUERY_##query_type_, \
-               .type = PIPE_DRIVER_QUERY_TYPE_##type_, \
-               .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, \
-               .group_id = group_id_ \
-       }
+#define XFULL(name_, query_type_, type_, result_type_, group_id_)                                  \
+   {                                                                                               \
+      .name = name_, .query_type = SI_QUERY_##query_type_, .type = PIPE_DRIVER_QUERY_TYPE_##type_, \
+      .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, .group_id = group_id_           \
+   }
 
-#define X(name_, query_type_, type_, result_type_) \
-       XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)
+#define X(name_, query_type_, type_, result_type_)                                                 \
+   XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)
 
-#define XG(group_, name_, query_type_, type_, result_type_) \
-       XFULL(name_, query_type_, type_, result_type_, SI_QUERY_GROUP_##group_)
+#define XG(group_, name_, query_type_, type_, result_type_)                                        \
+   XFULL(name_, query_type_, type_, result_type_, SI_QUERY_GROUP_##group_)
 
 static struct pipe_driver_query_info si_driver_query_list[] = {
-       X("num-compilations",           NUM_COMPILATIONS,       UINT64, CUMULATIVE),
-       X("num-shaders-created",        NUM_SHADERS_CREATED,    UINT64, CUMULATIVE),
-       X("draw-calls",                 DRAW_CALLS,             UINT64, AVERAGE),
-       X("decompress-calls",           DECOMPRESS_CALLS,       UINT64, AVERAGE),
-       X("MRT-draw-calls",             MRT_DRAW_CALLS,         UINT64, AVERAGE),
-       X("prim-restart-calls",         PRIM_RESTART_CALLS,     UINT64, AVERAGE),
-       X("spill-draw-calls",           SPILL_DRAW_CALLS,       UINT64, AVERAGE),
-       X("compute-calls",              COMPUTE_CALLS,          UINT64, AVERAGE),
-       X("spill-compute-calls",        SPILL_COMPUTE_CALLS,    UINT64, AVERAGE),
-       X("dma-calls",                  DMA_CALLS,              UINT64, AVERAGE),
-       X("cp-dma-calls",               CP_DMA_CALLS,           UINT64, AVERAGE),
-       X("num-vs-flushes",             NUM_VS_FLUSHES,         UINT64, AVERAGE),
-       X("num-ps-flushes",             NUM_PS_FLUSHES,         UINT64, AVERAGE),
-       X("num-cs-flushes",             NUM_CS_FLUSHES,         UINT64, AVERAGE),
-       X("num-CB-cache-flushes",       NUM_CB_CACHE_FLUSHES,   UINT64, AVERAGE),
-       X("num-DB-cache-flushes",       NUM_DB_CACHE_FLUSHES,   UINT64, AVERAGE),
-       X("num-L2-invalidates",         NUM_L2_INVALIDATES,     UINT64, AVERAGE),
-       X("num-L2-writebacks",          NUM_L2_WRITEBACKS,      UINT64, AVERAGE),
-       X("num-resident-handles",       NUM_RESIDENT_HANDLES,   UINT64, AVERAGE),
-       X("tc-offloaded-slots",         TC_OFFLOADED_SLOTS,     UINT64, AVERAGE),
-       X("tc-direct-slots",            TC_DIRECT_SLOTS,        UINT64, AVERAGE),
-       X("tc-num-syncs",               TC_NUM_SYNCS,           UINT64, AVERAGE),
-       X("CS-thread-busy",             CS_THREAD_BUSY,         UINT64, AVERAGE),
-       X("gallium-thread-busy",        GALLIUM_THREAD_BUSY,    UINT64, AVERAGE),
-       X("requested-VRAM",             REQUESTED_VRAM,         BYTES, AVERAGE),
-       X("requested-GTT",              REQUESTED_GTT,          BYTES, AVERAGE),
-       X("mapped-VRAM",                MAPPED_VRAM,            BYTES, AVERAGE),
-       X("mapped-GTT",                 MAPPED_GTT,             BYTES, AVERAGE),
-       X("buffer-wait-time",           BUFFER_WAIT_TIME,       MICROSECONDS, CUMULATIVE),
-       X("num-mapped-buffers",         NUM_MAPPED_BUFFERS,     UINT64, AVERAGE),
-       X("num-GFX-IBs",                NUM_GFX_IBS,            UINT64, AVERAGE),
-       X("num-SDMA-IBs",               NUM_SDMA_IBS,           UINT64, AVERAGE),
-       X("GFX-BO-list-size",           GFX_BO_LIST_SIZE,       UINT64, AVERAGE),
-       X("GFX-IB-size",                GFX_IB_SIZE,            UINT64, AVERAGE),
-       X("num-bytes-moved",            NUM_BYTES_MOVED,        BYTES, CUMULATIVE),
-       X("num-evictions",              NUM_EVICTIONS,          UINT64, CUMULATIVE),
-       X("VRAM-CPU-page-faults",       NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),
-       X("VRAM-usage",                 VRAM_USAGE,             BYTES, AVERAGE),
-       X("VRAM-vis-usage",             VRAM_VIS_USAGE,         BYTES, AVERAGE),
-       X("GTT-usage",                  GTT_USAGE,              BYTES, AVERAGE),
-       X("back-buffer-ps-draw-ratio",  BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),
-       X("live-shader-cache-hits",     LIVE_SHADER_CACHE_HITS, UINT, CUMULATIVE),
-       X("live-shader-cache-misses",   LIVE_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
-       X("memory-shader-cache-hits",   MEMORY_SHADER_CACHE_HITS, UINT, CUMULATIVE),
-       X("memory-shader-cache-misses", MEMORY_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
-       X("disk-shader-cache-hits",     DISK_SHADER_CACHE_HITS, UINT, CUMULATIVE),
-       X("disk-shader-cache-misses",   DISK_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
-
-       /* GPIN queries are for the benefit of old versions of GPUPerfStudio,
-        * which use it as a fallback path to detect the GPU type.
-        *
-        * Note: The names of these queries are significant for GPUPerfStudio
-        * (and possibly their order as well). */
-       XG(GPIN, "GPIN_000",            GPIN_ASIC_ID,           UINT, AVERAGE),
-       XG(GPIN, "GPIN_001",            GPIN_NUM_SIMD,          UINT, AVERAGE),
-       XG(GPIN, "GPIN_002",            GPIN_NUM_RB,            UINT, AVERAGE),
-       XG(GPIN, "GPIN_003",            GPIN_NUM_SPI,           UINT, AVERAGE),
-       XG(GPIN, "GPIN_004",            GPIN_NUM_SE,            UINT, AVERAGE),
-
-       X("temperature",                GPU_TEMPERATURE,        UINT64, AVERAGE),
-       X("shader-clock",               CURRENT_GPU_SCLK,       HZ, AVERAGE),
-       X("memory-clock",               CURRENT_GPU_MCLK,       HZ, AVERAGE),
-
-       /* The following queries must be at the end of the list because their
-        * availability is adjusted dynamically based on the DRM version. */
-       X("GPU-load",                   GPU_LOAD,               UINT64, AVERAGE),
-       X("GPU-shaders-busy",           GPU_SHADERS_BUSY,       UINT64, AVERAGE),
-       X("GPU-ta-busy",                GPU_TA_BUSY,            UINT64, AVERAGE),
-       X("GPU-gds-busy",               GPU_GDS_BUSY,           UINT64, AVERAGE),
-       X("GPU-vgt-busy",               GPU_VGT_BUSY,           UINT64, AVERAGE),
-       X("GPU-ia-busy",                GPU_IA_BUSY,            UINT64, AVERAGE),
-       X("GPU-sx-busy",                GPU_SX_BUSY,            UINT64, AVERAGE),
-       X("GPU-wd-busy",                GPU_WD_BUSY,            UINT64, AVERAGE),
-       X("GPU-bci-busy",               GPU_BCI_BUSY,           UINT64, AVERAGE),
-       X("GPU-sc-busy",                GPU_SC_BUSY,            UINT64, AVERAGE),
-       X("GPU-pa-busy",                GPU_PA_BUSY,            UINT64, AVERAGE),
-       X("GPU-db-busy",                GPU_DB_BUSY,            UINT64, AVERAGE),
-       X("GPU-cp-busy",                GPU_CP_BUSY,            UINT64, AVERAGE),
-       X("GPU-cb-busy",                GPU_CB_BUSY,            UINT64, AVERAGE),
-
-       /* SRBM_STATUS2 */
-       X("GPU-sdma-busy",              GPU_SDMA_BUSY,          UINT64, AVERAGE),
-
-       /* CP_STAT */
-       X("GPU-pfp-busy",               GPU_PFP_BUSY,           UINT64, AVERAGE),
-       X("GPU-meq-busy",               GPU_MEQ_BUSY,           UINT64, AVERAGE),
-       X("GPU-me-busy",                GPU_ME_BUSY,            UINT64, AVERAGE),
-       X("GPU-surf-sync-busy",         GPU_SURF_SYNC_BUSY,     UINT64, AVERAGE),
-       X("GPU-cp-dma-busy",            GPU_CP_DMA_BUSY,        UINT64, AVERAGE),
-       X("GPU-scratch-ram-busy",       GPU_SCRATCH_RAM_BUSY,   UINT64, AVERAGE),
-
-       X("pd-num-prims-accepted",      PD_NUM_PRIMS_ACCEPTED,  UINT64, AVERAGE),
-       X("pd-num-prims-rejected",      PD_NUM_PRIMS_REJECTED,  UINT64, AVERAGE),
-       X("pd-num-prims-ineligible",    PD_NUM_PRIMS_INELIGIBLE,UINT64, AVERAGE),
+   X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE),
+   X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE),
+   X("draw-calls", DRAW_CALLS, UINT64, AVERAGE),
+   X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE),
+   X("MRT-draw-calls", MRT_DRAW_CALLS, UINT64, AVERAGE),
+   X("prim-restart-calls", PRIM_RESTART_CALLS, UINT64, AVERAGE),
+   X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE),
+   X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE),
+   X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE),
+   X("dma-calls", DMA_CALLS, UINT64, AVERAGE),
+   X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE),
+   X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE),
+   X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE),
+   X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE),
+   X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE),
+   X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE),
+   X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE),
+   X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE),
+   X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE),
+   X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE),
+   X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE),
+   X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE),
+   X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE),
+   X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE),
+   X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE),
+   X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE),
+   X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE),
+   X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE),
+   X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE),
+   X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE),
+   X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE),
+   X("num-SDMA-IBs", NUM_SDMA_IBS, UINT64, AVERAGE),
+   X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE),
+   X("GFX-IB-size", GFX_IB_SIZE, UINT64, AVERAGE),
+   X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),
+   X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE),
+   X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),
+   X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE),
+   X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE),
+   X("GTT-usage", GTT_USAGE, BYTES, AVERAGE),
+   X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),
+   X("live-shader-cache-hits", LIVE_SHADER_CACHE_HITS, UINT, CUMULATIVE),
+   X("live-shader-cache-misses", LIVE_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
+   X("memory-shader-cache-hits", MEMORY_SHADER_CACHE_HITS, UINT, CUMULATIVE),
+   X("memory-shader-cache-misses", MEMORY_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
+   X("disk-shader-cache-hits", DISK_SHADER_CACHE_HITS, UINT, CUMULATIVE),
+   X("disk-shader-cache-misses", DISK_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
+
+   /* GPIN queries are for the benefit of old versions of GPUPerfStudio,
+    * which use them as a fallback path to detect the GPU type.
+    *
+    * Note: The names of these queries are significant for GPUPerfStudio
+    * (and possibly their order as well). */
+   XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE),
+   XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE),
+   XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE),
+   XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE),
+   XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE),
+
+   X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE),
+   X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE),
+   X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE),
+
+   /* The following queries must be at the end of the list because their
+    * availability is adjusted dynamically based on the DRM version. */
+   X("GPU-load", GPU_LOAD, UINT64, AVERAGE),
+   X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE),
+   X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE),
+   X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE),
+   X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE),
+   X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE),
+   X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE),
+   X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE),
+   X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE),
+   X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE),
+   X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE),
+   X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE),
+   X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE),
+   X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE),
+
+   /* SRBM_STATUS2 */
+   X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE),
+
+   /* CP_STAT */
+   X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE),
+   X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE),
+   X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE),
+   X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE),
+   X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE),
+   X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE),
+
+   X("pd-num-prims-accepted", PD_NUM_PRIMS_ACCEPTED, UINT64, AVERAGE),
+   X("pd-num-prims-rejected", PD_NUM_PRIMS_REJECTED, UINT64, AVERAGE),
+   X("pd-num-prims-ineligible", PD_NUM_PRIMS_INELIGIBLE, UINT64, AVERAGE),
 };
 
 #undef X
@@ -1848,119 +1764,116 @@ static struct pipe_driver_query_info si_driver_query_list[] = {
 
 static unsigned si_get_num_queries(struct si_screen *sscreen)
 {
-       /* amdgpu */
-       if (sscreen->info.is_amdgpu) {
-               if (sscreen->info.chip_class >= GFX8)
-                       return ARRAY_SIZE(si_driver_query_list);
-               else
-                       return ARRAY_SIZE(si_driver_query_list) - 7;
-       }
-
-       /* radeon */
-       if (sscreen->info.has_read_registers_query) {
-               if (sscreen->info.chip_class == GFX7)
-                       return ARRAY_SIZE(si_driver_query_list) - 6;
-               else
-                       return ARRAY_SIZE(si_driver_query_list) - 7;
-       }
-
-       return ARRAY_SIZE(si_driver_query_list) - 21;
+   /* amdgpu */
+   if (sscreen->info.is_amdgpu) {
+      if (sscreen->info.chip_class >= GFX8)
+         return ARRAY_SIZE(si_driver_query_list);
+      else
+         return ARRAY_SIZE(si_driver_query_list) - 7;
+   }
+
+   /* radeon */
+   if (sscreen->info.has_read_registers_query) {
+      if (sscreen->info.chip_class == GFX7)
+         return ARRAY_SIZE(si_driver_query_list) - 6;
+      else
+         return ARRAY_SIZE(si_driver_query_list) - 7;
+   }
+
+   return ARRAY_SIZE(si_driver_query_list) - 21;
 }
 
-static int si_get_driver_query_info(struct pipe_screen *screen,
-                                   unsigned index,
-                                   struct pipe_driver_query_info *info)
+static int si_get_driver_query_info(struct pipe_screen *screen, unsigned index,
+                                    struct pipe_driver_query_info *info)
 {
-       struct si_screen *sscreen = (struct si_screen*)screen;
-       unsigned num_queries = si_get_num_queries(sscreen);
-
-       if (!info) {
-               unsigned num_perfcounters =
-                       si_get_perfcounter_info(sscreen, 0, NULL);
-
-               return num_queries + num_perfcounters;
-       }
-
-       if (index >= num_queries)
-               return si_get_perfcounter_info(sscreen, index - num_queries, info);
-
-       *info = si_driver_query_list[index];
-
-       switch (info->query_type) {
-       case SI_QUERY_REQUESTED_VRAM:
-       case SI_QUERY_VRAM_USAGE:
-       case SI_QUERY_MAPPED_VRAM:
-               info->max_value.u64 = sscreen->info.vram_size;
-               break;
-       case SI_QUERY_REQUESTED_GTT:
-       case SI_QUERY_GTT_USAGE:
-       case SI_QUERY_MAPPED_GTT:
-               info->max_value.u64 = sscreen->info.gart_size;
-               break;
-       case SI_QUERY_GPU_TEMPERATURE:
-               info->max_value.u64 = 125;
-               break;
-       case SI_QUERY_VRAM_VIS_USAGE:
-               info->max_value.u64 = sscreen->info.vram_vis_size;
-               break;
-       }
-
-       if (info->group_id != ~(unsigned)0 && sscreen->perfcounters)
-               info->group_id += sscreen->perfcounters->num_groups;
-
-       return 1;
+   struct si_screen *sscreen = (struct si_screen *)screen;
+   unsigned num_queries = si_get_num_queries(sscreen);
+
+   if (!info) {
+      unsigned num_perfcounters = si_get_perfcounter_info(sscreen, 0, NULL);
+
+      return num_queries + num_perfcounters;
+   }
+
+   if (index >= num_queries)
+      return si_get_perfcounter_info(sscreen, index - num_queries, info);
+
+   *info = si_driver_query_list[index];
+
+   switch (info->query_type) {
+   case SI_QUERY_REQUESTED_VRAM:
+   case SI_QUERY_VRAM_USAGE:
+   case SI_QUERY_MAPPED_VRAM:
+      info->max_value.u64 = sscreen->info.vram_size;
+      break;
+   case SI_QUERY_REQUESTED_GTT:
+   case SI_QUERY_GTT_USAGE:
+   case SI_QUERY_MAPPED_GTT:
+      info->max_value.u64 = sscreen->info.gart_size;
+      break;
+   case SI_QUERY_GPU_TEMPERATURE:
+      info->max_value.u64 = 125;
+      break;
+   case SI_QUERY_VRAM_VIS_USAGE:
+      info->max_value.u64 = sscreen->info.vram_vis_size;
+      break;
+   }
+
+   if (info->group_id != ~(unsigned)0 && sscreen->perfcounters)
+      info->group_id += sscreen->perfcounters->num_groups;
+
+   return 1;
 }
 
 /* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware
  * performance counter groups, so be careful when changing this and related
  * functions.
  */
-static int si_get_driver_query_group_info(struct pipe_screen *screen,
-                                         unsigned index,
-                                         struct pipe_driver_query_group_info *info)
+static int si_get_driver_query_group_info(struct pipe_screen *screen, unsigned index,
+                                          struct pipe_driver_query_group_info *info)
 {
-       struct si_screen *sscreen = (struct si_screen *)screen;
-       unsigned num_pc_groups = 0;
+   struct si_screen *sscreen = (struct si_screen *)screen;
+   unsigned num_pc_groups = 0;
 
-       if (sscreen->perfcounters)
-               num_pc_groups = sscreen->perfcounters->num_groups;
+   if (sscreen->perfcounters)
+      num_pc_groups = sscreen->perfcounters->num_groups;
 
-       if (!info)
-               return num_pc_groups + SI_NUM_SW_QUERY_GROUPS;
+   if (!info)
+      return num_pc_groups + SI_NUM_SW_QUERY_GROUPS;
 
-       if (index < num_pc_groups)
-               return si_get_perfcounter_group_info(sscreen, index, info);
+   if (index < num_pc_groups)
+      return si_get_perfcounter_group_info(sscreen, index, info);
 
-       index -= num_pc_groups;
-       if (index >= SI_NUM_SW_QUERY_GROUPS)
-               return 0;
+   index -= num_pc_groups;
+   if (index >= SI_NUM_SW_QUERY_GROUPS)
+      return 0;
 
-       info->name = "GPIN";
-       info->max_active_queries = 5;
-       info->num_queries = 5;
-       return 1;
+   info->name = "GPIN";
+   info->max_active_queries = 5;
+   info->num_queries = 5;
+   return 1;
 }
 
 void si_init_query_functions(struct si_context *sctx)
 {
-       sctx->b.create_query = si_create_query;
-       sctx->b.create_batch_query = si_create_batch_query;
-       sctx->b.destroy_query = si_destroy_query;
-       sctx->b.begin_query = si_begin_query;
-       sctx->b.end_query = si_end_query;
-       sctx->b.get_query_result = si_get_query_result;
-       sctx->b.get_query_result_resource = si_get_query_result_resource;
-
-       if (sctx->has_graphics) {
-               sctx->atoms.s.render_cond.emit = si_emit_query_predication;
-               sctx->b.render_condition = si_render_condition;
-       }
-
-       list_inithead(&sctx->active_queries);
+   sctx->b.create_query = si_create_query;
+   sctx->b.create_batch_query = si_create_batch_query;
+   sctx->b.destroy_query = si_destroy_query;
+   sctx->b.begin_query = si_begin_query;
+   sctx->b.end_query = si_end_query;
+   sctx->b.get_query_result = si_get_query_result;
+   sctx->b.get_query_result_resource = si_get_query_result_resource;
+
+   if (sctx->has_graphics) {
+      sctx->atoms.s.render_cond.emit = si_emit_query_predication;
+      sctx->b.render_condition = si_render_condition;
+   }
+
+   list_inithead(&sctx->active_queries);
 }
 
 void si_init_screen_query_functions(struct si_screen *sscreen)
 {
-       sscreen->b.get_driver_query_info = si_get_driver_query_info;
-       sscreen->b.get_driver_query_group_info = si_get_driver_query_group_info;
+   sscreen->b.get_driver_query_info = si_get_driver_query_info;
+   sscreen->b.get_driver_query_group_info = si_get_driver_query_group_info;
 }
index 6c4386451cceb40c857c8d0bb0c4847ba51e1a56..1eaa3b255a612cdd799351a3a764bfef09ad12ca 100644 (file)
@@ -40,236 +40,220 @@ struct si_resource;
 
 #define SI_MAX_STREAMS 4
 
-enum {
-       SI_QUERY_DRAW_CALLS = PIPE_QUERY_DRIVER_SPECIFIC,
-       SI_QUERY_DECOMPRESS_CALLS,
-       SI_QUERY_MRT_DRAW_CALLS,
-       SI_QUERY_PRIM_RESTART_CALLS,
-       SI_QUERY_SPILL_DRAW_CALLS,
-       SI_QUERY_COMPUTE_CALLS,
-       SI_QUERY_SPILL_COMPUTE_CALLS,
-       SI_QUERY_DMA_CALLS,
-       SI_QUERY_CP_DMA_CALLS,
-       SI_QUERY_NUM_VS_FLUSHES,
-       SI_QUERY_NUM_PS_FLUSHES,
-       SI_QUERY_NUM_CS_FLUSHES,
-       SI_QUERY_NUM_CB_CACHE_FLUSHES,
-       SI_QUERY_NUM_DB_CACHE_FLUSHES,
-       SI_QUERY_NUM_L2_INVALIDATES,
-       SI_QUERY_NUM_L2_WRITEBACKS,
-       SI_QUERY_NUM_RESIDENT_HANDLES,
-       SI_QUERY_TC_OFFLOADED_SLOTS,
-       SI_QUERY_TC_DIRECT_SLOTS,
-       SI_QUERY_TC_NUM_SYNCS,
-       SI_QUERY_CS_THREAD_BUSY,
-       SI_QUERY_GALLIUM_THREAD_BUSY,
-       SI_QUERY_REQUESTED_VRAM,
-       SI_QUERY_REQUESTED_GTT,
-       SI_QUERY_MAPPED_VRAM,
-       SI_QUERY_MAPPED_GTT,
-       SI_QUERY_BUFFER_WAIT_TIME,
-       SI_QUERY_NUM_MAPPED_BUFFERS,
-       SI_QUERY_NUM_GFX_IBS,
-       SI_QUERY_NUM_SDMA_IBS,
-       SI_QUERY_GFX_BO_LIST_SIZE,
-       SI_QUERY_GFX_IB_SIZE,
-       SI_QUERY_NUM_BYTES_MOVED,
-       SI_QUERY_NUM_EVICTIONS,
-       SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS,
-       SI_QUERY_VRAM_USAGE,
-       SI_QUERY_VRAM_VIS_USAGE,
-       SI_QUERY_GTT_USAGE,
-       SI_QUERY_GPU_TEMPERATURE,
-       SI_QUERY_CURRENT_GPU_SCLK,
-       SI_QUERY_CURRENT_GPU_MCLK,
-       SI_QUERY_GPU_LOAD,
-       SI_QUERY_GPU_SHADERS_BUSY,
-       SI_QUERY_GPU_TA_BUSY,
-       SI_QUERY_GPU_GDS_BUSY,
-       SI_QUERY_GPU_VGT_BUSY,
-       SI_QUERY_GPU_IA_BUSY,
-       SI_QUERY_GPU_SX_BUSY,
-       SI_QUERY_GPU_WD_BUSY,
-       SI_QUERY_GPU_BCI_BUSY,
-       SI_QUERY_GPU_SC_BUSY,
-       SI_QUERY_GPU_PA_BUSY,
-       SI_QUERY_GPU_DB_BUSY,
-       SI_QUERY_GPU_CP_BUSY,
-       SI_QUERY_GPU_CB_BUSY,
-       SI_QUERY_GPU_SDMA_BUSY,
-       SI_QUERY_GPU_PFP_BUSY,
-       SI_QUERY_GPU_MEQ_BUSY,
-       SI_QUERY_GPU_ME_BUSY,
-       SI_QUERY_GPU_SURF_SYNC_BUSY,
-       SI_QUERY_GPU_CP_DMA_BUSY,
-       SI_QUERY_GPU_SCRATCH_RAM_BUSY,
-       SI_QUERY_NUM_COMPILATIONS,
-       SI_QUERY_NUM_SHADERS_CREATED,
-       SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO,
-       SI_QUERY_GPIN_ASIC_ID,
-       SI_QUERY_GPIN_NUM_SIMD,
-       SI_QUERY_GPIN_NUM_RB,
-       SI_QUERY_GPIN_NUM_SPI,
-       SI_QUERY_GPIN_NUM_SE,
-       SI_QUERY_TIME_ELAPSED_SDMA,
-       SI_QUERY_TIME_ELAPSED_SDMA_SI, /* emulated, measured on the CPU */
-       SI_QUERY_PD_NUM_PRIMS_ACCEPTED,
-       SI_QUERY_PD_NUM_PRIMS_REJECTED,
-       SI_QUERY_PD_NUM_PRIMS_INELIGIBLE,
-       SI_QUERY_LIVE_SHADER_CACHE_HITS,
-       SI_QUERY_LIVE_SHADER_CACHE_MISSES,
-       SI_QUERY_MEMORY_SHADER_CACHE_HITS,
-       SI_QUERY_MEMORY_SHADER_CACHE_MISSES,
-       SI_QUERY_DISK_SHADER_CACHE_HITS,
-       SI_QUERY_DISK_SHADER_CACHE_MISSES,
-
-       SI_QUERY_FIRST_PERFCOUNTER = PIPE_QUERY_DRIVER_SPECIFIC + 100,
+enum
+{
+   SI_QUERY_DRAW_CALLS = PIPE_QUERY_DRIVER_SPECIFIC,
+   SI_QUERY_DECOMPRESS_CALLS,
+   SI_QUERY_MRT_DRAW_CALLS,
+   SI_QUERY_PRIM_RESTART_CALLS,
+   SI_QUERY_SPILL_DRAW_CALLS,
+   SI_QUERY_COMPUTE_CALLS,
+   SI_QUERY_SPILL_COMPUTE_CALLS,
+   SI_QUERY_DMA_CALLS,
+   SI_QUERY_CP_DMA_CALLS,
+   SI_QUERY_NUM_VS_FLUSHES,
+   SI_QUERY_NUM_PS_FLUSHES,
+   SI_QUERY_NUM_CS_FLUSHES,
+   SI_QUERY_NUM_CB_CACHE_FLUSHES,
+   SI_QUERY_NUM_DB_CACHE_FLUSHES,
+   SI_QUERY_NUM_L2_INVALIDATES,
+   SI_QUERY_NUM_L2_WRITEBACKS,
+   SI_QUERY_NUM_RESIDENT_HANDLES,
+   SI_QUERY_TC_OFFLOADED_SLOTS,
+   SI_QUERY_TC_DIRECT_SLOTS,
+   SI_QUERY_TC_NUM_SYNCS,
+   SI_QUERY_CS_THREAD_BUSY,
+   SI_QUERY_GALLIUM_THREAD_BUSY,
+   SI_QUERY_REQUESTED_VRAM,
+   SI_QUERY_REQUESTED_GTT,
+   SI_QUERY_MAPPED_VRAM,
+   SI_QUERY_MAPPED_GTT,
+   SI_QUERY_BUFFER_WAIT_TIME,
+   SI_QUERY_NUM_MAPPED_BUFFERS,
+   SI_QUERY_NUM_GFX_IBS,
+   SI_QUERY_NUM_SDMA_IBS,
+   SI_QUERY_GFX_BO_LIST_SIZE,
+   SI_QUERY_GFX_IB_SIZE,
+   SI_QUERY_NUM_BYTES_MOVED,
+   SI_QUERY_NUM_EVICTIONS,
+   SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS,
+   SI_QUERY_VRAM_USAGE,
+   SI_QUERY_VRAM_VIS_USAGE,
+   SI_QUERY_GTT_USAGE,
+   SI_QUERY_GPU_TEMPERATURE,
+   SI_QUERY_CURRENT_GPU_SCLK,
+   SI_QUERY_CURRENT_GPU_MCLK,
+   SI_QUERY_GPU_LOAD,
+   SI_QUERY_GPU_SHADERS_BUSY,
+   SI_QUERY_GPU_TA_BUSY,
+   SI_QUERY_GPU_GDS_BUSY,
+   SI_QUERY_GPU_VGT_BUSY,
+   SI_QUERY_GPU_IA_BUSY,
+   SI_QUERY_GPU_SX_BUSY,
+   SI_QUERY_GPU_WD_BUSY,
+   SI_QUERY_GPU_BCI_BUSY,
+   SI_QUERY_GPU_SC_BUSY,
+   SI_QUERY_GPU_PA_BUSY,
+   SI_QUERY_GPU_DB_BUSY,
+   SI_QUERY_GPU_CP_BUSY,
+   SI_QUERY_GPU_CB_BUSY,
+   SI_QUERY_GPU_SDMA_BUSY,
+   SI_QUERY_GPU_PFP_BUSY,
+   SI_QUERY_GPU_MEQ_BUSY,
+   SI_QUERY_GPU_ME_BUSY,
+   SI_QUERY_GPU_SURF_SYNC_BUSY,
+   SI_QUERY_GPU_CP_DMA_BUSY,
+   SI_QUERY_GPU_SCRATCH_RAM_BUSY,
+   SI_QUERY_NUM_COMPILATIONS,
+   SI_QUERY_NUM_SHADERS_CREATED,
+   SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO,
+   SI_QUERY_GPIN_ASIC_ID,
+   SI_QUERY_GPIN_NUM_SIMD,
+   SI_QUERY_GPIN_NUM_RB,
+   SI_QUERY_GPIN_NUM_SPI,
+   SI_QUERY_GPIN_NUM_SE,
+   SI_QUERY_TIME_ELAPSED_SDMA,
+   SI_QUERY_TIME_ELAPSED_SDMA_SI, /* emulated, measured on the CPU */
+   SI_QUERY_PD_NUM_PRIMS_ACCEPTED,
+   SI_QUERY_PD_NUM_PRIMS_REJECTED,
+   SI_QUERY_PD_NUM_PRIMS_INELIGIBLE,
+   SI_QUERY_LIVE_SHADER_CACHE_HITS,
+   SI_QUERY_LIVE_SHADER_CACHE_MISSES,
+   SI_QUERY_MEMORY_SHADER_CACHE_HITS,
+   SI_QUERY_MEMORY_SHADER_CACHE_MISSES,
+   SI_QUERY_DISK_SHADER_CACHE_HITS,
+   SI_QUERY_DISK_SHADER_CACHE_MISSES,
+
+   SI_QUERY_FIRST_PERFCOUNTER = PIPE_QUERY_DRIVER_SPECIFIC + 100,
 };
 
-enum {
-       SI_QUERY_GROUP_GPIN = 0,
-       SI_NUM_SW_QUERY_GROUPS
+enum
+{
+   SI_QUERY_GROUP_GPIN = 0,
+   SI_NUM_SW_QUERY_GROUPS
 };
 
 struct si_query_ops {
-       void (*destroy)(struct si_context *, struct si_query *);
-       bool (*begin)(struct si_context *, struct si_query *);
-       bool (*end)(struct si_context *, struct si_query *);
-       bool (*get_result)(struct si_context *,
-                          struct si_query *, bool wait,
-                          union pipe_query_result *result);
-       void (*get_result_resource)(struct si_context *,
-                                   struct si_query *, bool wait,
-                                   enum pipe_query_value_type result_type,
-                                   int index,
-                                   struct pipe_resource *resource,
-                                   unsigned offset);
-
-       void (*suspend)(struct si_context *, struct si_query *);
-       void (*resume)(struct si_context *, struct si_query *);
+   void (*destroy)(struct si_context *, struct si_query *);
+   bool (*begin)(struct si_context *, struct si_query *);
+   bool (*end)(struct si_context *, struct si_query *);
+   bool (*get_result)(struct si_context *, struct si_query *, bool wait,
+                      union pipe_query_result *result);
+   void (*get_result_resource)(struct si_context *, struct si_query *, bool wait,
+                               enum pipe_query_value_type result_type, int index,
+                               struct pipe_resource *resource, unsigned offset);
+
+   void (*suspend)(struct si_context *, struct si_query *);
+   void (*resume)(struct si_context *, struct si_query *);
 };
 
 struct si_query {
-       struct threaded_query b;
-       const struct si_query_ops *ops;
+   struct threaded_query b;
+   const struct si_query_ops *ops;
 
-       /* The PIPE_QUERY_xxx type of query */
-       unsigned type;
+   /* The PIPE_QUERY_xxx type of query */
+   unsigned type;
 
-       /* The number of dwords for suspend. */
-       unsigned num_cs_dw_suspend;
+   /* The number of dwords for suspend. */
+   unsigned num_cs_dw_suspend;
 
-       /* Linked list of queries that must be suspended at end of CS. */
-       struct list_head active_list;
+   /* Linked list of queries that must be suspended at end of CS. */
+   struct list_head active_list;
 };
 
-enum {
-       SI_QUERY_HW_FLAG_NO_START = (1 << 0),
-       /* gap */
-       /* whether begin_query doesn't clear the result */
-       SI_QUERY_HW_FLAG_BEGIN_RESUMES = (1 << 2),
+enum
+{
+   SI_QUERY_HW_FLAG_NO_START = (1 << 0),
+   /* gap */
+   /* whether begin_query doesn't clear the result */
+   SI_QUERY_HW_FLAG_BEGIN_RESUMES = (1 << 2),
 };
 
 struct si_query_hw_ops {
-       bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *);
-       void (*emit_start)(struct si_context *,
-                          struct si_query_hw *,
-                          struct si_resource *buffer, uint64_t va);
-       void (*emit_stop)(struct si_context *,
-                         struct si_query_hw *,
-                         struct si_resource *buffer, uint64_t va);
-       void (*clear_result)(struct si_query_hw *, union pipe_query_result *);
-       void (*add_result)(struct si_screen *screen,
-                          struct si_query_hw *, void *buffer,
-                          union pipe_query_result *result);
+   bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *);
+   void (*emit_start)(struct si_context *, struct si_query_hw *, struct si_resource *buffer,
+                      uint64_t va);
+   void (*emit_stop)(struct si_context *, struct si_query_hw *, struct si_resource *buffer,
+                     uint64_t va);
+   void (*clear_result)(struct si_query_hw *, union pipe_query_result *);
+   void (*add_result)(struct si_screen *screen, struct si_query_hw *, void *buffer,
+                      union pipe_query_result *result);
 };
 
 struct si_query_buffer {
-       /* The buffer where query results are stored. */
-       struct si_resource              *buf;
-       /* If a query buffer is full, a new buffer is created and the old one
-        * is put in here. When we calculate the result, we sum up the samples
-        * from all buffers. */
-       struct si_query_buffer  *previous;
-       /* Offset of the next free result after current query data */
-       unsigned                        results_end;
-       bool unprepared;
+   /* The buffer where query results are stored. */
+   struct si_resource *buf;
+   /* If a query buffer is full, a new buffer is created and the old one
+    * is put in here. When we calculate the result, we sum up the samples
+    * from all buffers. */
+   struct si_query_buffer *previous;
+   /* Offset of the next free result after current query data */
+   unsigned results_end;
+   bool unprepared;
 };
 
 void si_query_buffer_destroy(struct si_screen *sctx, struct si_query_buffer *buffer);
 void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buffer);
 bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buffer,
-                          bool (*prepare_buffer)(struct si_context *, struct si_query_buffer*),
-                          unsigned size);
-
+                           bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *),
+                           unsigned size);
 
 struct si_query_hw {
-       struct si_query b;
-       struct si_query_hw_ops *ops;
-       unsigned flags;
-
-       /* The query buffer and how many results are in it. */
-       struct si_query_buffer buffer;
-       /* Size of the result in memory for both begin_query and end_query,
-        * this can be one or two numbers, or it could even be a size of a structure. */
-       unsigned result_size;
-       /* For transform feedback: which stream the query is for */
-       unsigned stream;
-
-       /* Workaround via compute shader */
-       struct si_resource *workaround_buf;
-       unsigned workaround_offset;
+   struct si_query b;
+   struct si_query_hw_ops *ops;
+   unsigned flags;
+
+   /* The query buffer and how many results are in it. */
+   struct si_query_buffer buffer;
+   /* Size of the result in memory for both begin_query and end_query;
+    * this can be one or two numbers, or even the size of a structure. */
+   unsigned result_size;
+   /* For transform feedback: which stream the query is for */
+   unsigned stream;
+
+   /* Workaround via compute shader */
+   struct si_resource *workaround_buf;
+   unsigned workaround_offset;
 };
 
-void si_query_hw_destroy(struct si_context *sctx,
-                        struct si_query *squery);
-bool si_query_hw_begin(struct si_context *sctx,
-                      struct si_query *squery);
-bool si_query_hw_end(struct si_context *sctx,
-                    struct si_query *squery);
-bool si_query_hw_get_result(struct si_context *sctx,
-                           struct si_query *squery,
-                           bool wait,
-                           union pipe_query_result *result);
+void si_query_hw_destroy(struct si_context *sctx, struct si_query *squery);
+bool si_query_hw_begin(struct si_context *sctx, struct si_query *squery);
+bool si_query_hw_end(struct si_context *sctx, struct si_query *squery);
+bool si_query_hw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
+                            union pipe_query_result *result);
 void si_query_hw_suspend(struct si_context *sctx, struct si_query *query);
 void si_query_hw_resume(struct si_context *sctx, struct si_query *query);
 
-
 /* Shader-based queries */
-struct pipe_query *gfx10_sh_query_create(struct si_screen *screen,
-                                        enum pipe_query_type query_type,
-                                        unsigned index);
-
+struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
+                                         unsigned index);
 
 /* Performance counters */
 struct si_perfcounters {
-       unsigned num_groups;
-       unsigned num_blocks;
-       struct si_pc_block *blocks;
+   unsigned num_groups;
+   unsigned num_blocks;
+   struct si_pc_block *blocks;
 
-       unsigned num_stop_cs_dwords;
-       unsigned num_instance_cs_dwords;
+   unsigned num_stop_cs_dwords;
+   unsigned num_instance_cs_dwords;
 
-       bool separate_se;
-       bool separate_instance;
+   bool separate_se;
+   bool separate_instance;
 };
 
-struct pipe_query *si_create_batch_query(struct pipe_context *ctx,
-                                        unsigned num_queries,
-                                        unsigned *query_types);
+struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_queries,
+                                         unsigned *query_types);
 
-int si_get_perfcounter_info(struct si_screen *,
-                           unsigned index,
-                           struct pipe_driver_query_info *info);
-int si_get_perfcounter_group_info(struct si_screen *,
-                                 unsigned index,
-                                 struct pipe_driver_query_group_info *info);
+int si_get_perfcounter_info(struct si_screen *, unsigned index,
+                            struct pipe_driver_query_info *info);
+int si_get_perfcounter_group_info(struct si_screen *, unsigned index,
+                                  struct pipe_driver_query_group_info *info);
 
 struct si_qbo_state {
-       void *saved_compute;
-       struct pipe_constant_buffer saved_const0;
-       struct pipe_shader_buffer saved_ssbo[3];
-       unsigned saved_ssbo_writable_mask;
+   void *saved_compute;
+   struct pipe_constant_buffer saved_const0;
+   struct pipe_shader_buffer saved_ssbo[3];
+   unsigned saved_ssbo_writable_mask;
 };
 
 #endif /* SI_QUERY_H */
index f0e60087dbfc5ae510ee79869ad9ba2d207af3a5..e615b81c29394109614458e355d5e3e10d197183 100644 (file)
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include "util/u_memory.h"
-#include "tgsi/tgsi_strings.h"
-#include "tgsi/tgsi_from_mesa.h"
-
 #include "ac_exp_param.h"
 #include "ac_rtld.h"
-#include "si_shader_internal.h"
-#include "si_pipe.h"
-#include "sid.h"
-
 #include "compiler/nir/nir.h"
 #include "compiler/nir/nir_serialize.h"
+#include "si_pipe.h"
+#include "si_shader_internal.h"
+#include "sid.h"
+#include "tgsi/tgsi_from_mesa.h"
+#include "tgsi/tgsi_strings.h"
+#include "util/u_memory.h"
 
-static const char scratch_rsrc_dword0_symbol[] =
-       "SCRATCH_RSRC_DWORD0";
+static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0";
 
-static const char scratch_rsrc_dword1_symbol[] =
-       "SCRATCH_RSRC_DWORD1";
+static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1";
 
 static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
 
 /** Whether the shader runs as a combination of multiple API shaders */
 bool si_is_multi_part_shader(struct si_shader *shader)
 {
-       if (shader->selector->screen->info.chip_class <= GFX8)
-               return false;
+   if (shader->selector->screen->info.chip_class <= GFX8)
+      return false;
 
-       return shader->key.as_ls ||
-              shader->key.as_es ||
-              shader->selector->type == PIPE_SHADER_TESS_CTRL ||
-              shader->selector->type == PIPE_SHADER_GEOMETRY;
+   return shader->key.as_ls || shader->key.as_es ||
+          shader->selector->type == PIPE_SHADER_TESS_CTRL ||
+          shader->selector->type == PIPE_SHADER_GEOMETRY;
 }
 
 /** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
 bool si_is_merged_shader(struct si_shader *shader)
 {
-       return shader->key.as_ngg || si_is_multi_part_shader(shader);
+   return shader->key.as_ngg || si_is_multi_part_shader(shader);
 }
 
 /**
@@ -68,19 +63,19 @@ bool si_is_merged_shader(struct si_shader *shader)
  */
 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
 {
-       switch (semantic_name) {
-       case TGSI_SEMANTIC_TESSOUTER:
-               return 0;
-       case TGSI_SEMANTIC_TESSINNER:
-               return 1;
-       case TGSI_SEMANTIC_PATCH:
-               assert(index < 30);
-               return 2 + index;
-
-       default:
-               assert(!"invalid semantic name");
-               return 0;
-       }
+   switch (semantic_name) {
+   case TGSI_SEMANTIC_TESSOUTER:
+      return 0;
+   case TGSI_SEMANTIC_TESSINNER:
+      return 1;
+   case TGSI_SEMANTIC_PATCH:
+      assert(index < 30);
+      return 2 + index;
+
+   default:
+      assert(!"invalid semantic name");
+      return 0;
+   }
 }
 
 /**
@@ -88,1527 +83,1420 @@ unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned in
  * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
  * calculated.
  */
-unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index,
-                                      unsigned is_varying)
+unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index, unsigned is_varying)
 {
-       switch (semantic_name) {
-       case TGSI_SEMANTIC_POSITION:
-               return 0;
-       case TGSI_SEMANTIC_GENERIC:
-               /* Since some shader stages use the the highest used IO index
-                * to determine the size to allocate for inputs/outputs
-                * (in LDS, tess and GS rings). GENERIC should be placed right
-                * after POSITION to make that size as small as possible.
-                */
-               if (index < SI_MAX_IO_GENERIC)
-                       return 1 + index;
-
-               assert(!"invalid generic index");
-               return 0;
-       case TGSI_SEMANTIC_FOG:
-               return SI_MAX_IO_GENERIC + 1;
-       case TGSI_SEMANTIC_COLOR:
-               assert(index < 2);
-               return SI_MAX_IO_GENERIC + 2 + index;
-       case TGSI_SEMANTIC_BCOLOR:
-               assert(index < 2);
-               /* If it's a varying, COLOR and BCOLOR alias. */
-               if (is_varying)
-                       return SI_MAX_IO_GENERIC + 2 + index;
-               else
-                       return SI_MAX_IO_GENERIC + 4 + index;
-       case TGSI_SEMANTIC_TEXCOORD:
-               assert(index < 8);
-               return SI_MAX_IO_GENERIC + 6 + index;
-
-       /* These are rarely used between LS and HS or ES and GS. */
-       case TGSI_SEMANTIC_CLIPDIST:
-               assert(index < 2);
-               return SI_MAX_IO_GENERIC + 6 + 8 + index;
-       case TGSI_SEMANTIC_CLIPVERTEX:
-               return SI_MAX_IO_GENERIC + 6 + 8 + 2;
-       case TGSI_SEMANTIC_PSIZE:
-               return SI_MAX_IO_GENERIC + 6 + 8 + 3;
-
-       /* These can't be written by LS, HS, and ES. */
-       case TGSI_SEMANTIC_LAYER:
-               return SI_MAX_IO_GENERIC + 6 + 8 + 4;
-       case TGSI_SEMANTIC_VIEWPORT_INDEX:
-               return SI_MAX_IO_GENERIC + 6 + 8 + 5;
-       case TGSI_SEMANTIC_PRIMID:
-               STATIC_ASSERT(SI_MAX_IO_GENERIC + 6 + 8 + 6 <= 63);
-               return SI_MAX_IO_GENERIC + 6 + 8 + 6;
-       default:
-               fprintf(stderr, "invalid semantic name = %u\n", semantic_name);
-               assert(!"invalid semantic name");
-               return 0;
-       }
+   switch (semantic_name) {
+   case TGSI_SEMANTIC_POSITION:
+      return 0;
+   case TGSI_SEMANTIC_GENERIC:
+      /* Some shader stages use the highest used IO index to determine
+       * the size to allocate for inputs/outputs (in LDS, tess and GS
+       * rings), so GENERIC should be placed right after POSITION to
+       * make that size as small as possible.
+       */
+      if (index < SI_MAX_IO_GENERIC)
+         return 1 + index;
+
+      assert(!"invalid generic index");
+      return 0;
+   case TGSI_SEMANTIC_FOG:
+      return SI_MAX_IO_GENERIC + 1;
+   case TGSI_SEMANTIC_COLOR:
+      assert(index < 2);
+      return SI_MAX_IO_GENERIC + 2 + index;
+   case TGSI_SEMANTIC_BCOLOR:
+      assert(index < 2);
+      /* If it's a varying, COLOR and BCOLOR alias. */
+      if (is_varying)
+         return SI_MAX_IO_GENERIC + 2 + index;
+      else
+         return SI_MAX_IO_GENERIC + 4 + index;
+   case TGSI_SEMANTIC_TEXCOORD:
+      assert(index < 8);
+      return SI_MAX_IO_GENERIC + 6 + index;
+
+   /* These are rarely used between LS and HS or ES and GS. */
+   case TGSI_SEMANTIC_CLIPDIST:
+      assert(index < 2);
+      return SI_MAX_IO_GENERIC + 6 + 8 + index;
+   case TGSI_SEMANTIC_CLIPVERTEX:
+      return SI_MAX_IO_GENERIC + 6 + 8 + 2;
+   case TGSI_SEMANTIC_PSIZE:
+      return SI_MAX_IO_GENERIC + 6 + 8 + 3;
+
+   /* These can't be written by LS, HS, and ES. */
+   case TGSI_SEMANTIC_LAYER:
+      return SI_MAX_IO_GENERIC + 6 + 8 + 4;
+   case TGSI_SEMANTIC_VIEWPORT_INDEX:
+      return SI_MAX_IO_GENERIC + 6 + 8 + 5;
+   case TGSI_SEMANTIC_PRIMID:
+      STATIC_ASSERT(SI_MAX_IO_GENERIC + 6 + 8 + 6 <= 63);
+      return SI_MAX_IO_GENERIC + 6 + 8 + 6;
+   default:
+      fprintf(stderr, "invalid semantic name = %u\n", semantic_name);
+      assert(!"invalid semantic name");
+      return 0;
+   }
 }
 
 static void si_dump_streamout(struct pipe_stream_output_info *so)
 {
-       unsigned i;
-
-       if (so->num_outputs)
-               fprintf(stderr, "STREAMOUT\n");
-
-       for (i = 0; i < so->num_outputs; i++) {
-               unsigned mask = ((1 << so->output[i].num_components) - 1) <<
-                               so->output[i].start_component;
-               fprintf(stderr, "  %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
-                       i, so->output[i].output_buffer,
-                       so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
-                       so->output[i].register_index,
-                       mask & 1 ? "x" : "",
-                       mask & 2 ? "y" : "",
-                       mask & 4 ? "z" : "",
-                       mask & 8 ? "w" : "");
-       }
+   unsigned i;
+
+   if (so->num_outputs)
+      fprintf(stderr, "STREAMOUT\n");
+
+   for (i = 0; i < so->num_outputs; i++) {
+      unsigned mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component;
+      fprintf(stderr, "  %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n", i, so->output[i].output_buffer,
+              so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
+              so->output[i].register_index, mask & 1 ? "x" : "", mask & 2 ? "y" : "",
+              mask & 4 ? "z" : "", mask & 8 ? "w" : "");
+   }
 }
 
 static void declare_streamout_params(struct si_shader_context *ctx,
-                                    struct pipe_stream_output_info *so)
+                                     struct pipe_stream_output_info *so)
 {
-       if (ctx->screen->use_ngg_streamout) {
-               if (ctx->type == PIPE_SHADER_TESS_EVAL)
-                       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               return;
-       }
-
-       /* Streamout SGPRs. */
-       if (so->num_outputs) {
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_config);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_write_index);
-       } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-       }
-
-       /* A streamout buffer offset is loaded if the stride is non-zero. */
-       for (int i = 0; i < 4; i++) {
-               if (!so->stride[i])
-                       continue;
-
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_offset[i]);
-       }
+   if (ctx->screen->use_ngg_streamout) {
+      if (ctx->type == PIPE_SHADER_TESS_EVAL)
+         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      return;
+   }
+
+   /* Streamout SGPRs. */
+   if (so->num_outputs) {
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_config);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_write_index);
+   } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+   }
+
+   /* A streamout buffer offset is loaded if the stride is non-zero. */
+   for (int i = 0; i < 4; i++) {
+      if (!so->stride[i])
+         continue;
+
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_offset[i]);
+   }
 }
 
 unsigned si_get_max_workgroup_size(const struct si_shader *shader)
 {
-       switch (shader->selector->type) {
-       case PIPE_SHADER_VERTEX:
-       case PIPE_SHADER_TESS_EVAL:
-               return shader->key.as_ngg ? 128 : 0;
-
-       case PIPE_SHADER_TESS_CTRL:
-               /* Return this so that LLVM doesn't remove s_barrier
-                * instructions on chips where we use s_barrier. */
-               return shader->selector->screen->info.chip_class >= GFX7 ? 128 : 0;
-
-       case PIPE_SHADER_GEOMETRY:
-               return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0;
-
-       case PIPE_SHADER_COMPUTE:
-               break; /* see below */
-
-       default:
-               return 0;
-       }
-
-       const unsigned *properties = shader->selector->info.properties;
-       unsigned max_work_group_size =
-                      properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
-                      properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
-                      properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
-
-       if (!max_work_group_size) {
-               /* This is a variable group size compute shader,
-                * compile it for the maximum possible group size.
-                */
-               max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
-       }
-       return max_work_group_size;
+   switch (shader->selector->type) {
+   case PIPE_SHADER_VERTEX:
+   case PIPE_SHADER_TESS_EVAL:
+      return shader->key.as_ngg ? 128 : 0;
+
+   case PIPE_SHADER_TESS_CTRL:
+      /* Return this so that LLVM doesn't remove s_barrier
+       * instructions on chips where we use s_barrier. */
+      return shader->selector->screen->info.chip_class >= GFX7 ? 128 : 0;
+
+   case PIPE_SHADER_GEOMETRY:
+      return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0;
+
+   case PIPE_SHADER_COMPUTE:
+      break; /* see below */
+
+   default:
+      return 0;
+   }
+
+   const unsigned *properties = shader->selector->info.properties;
+   unsigned max_work_group_size = properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
+                                  properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
+                                  properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
+
+   if (!max_work_group_size) {
+      /* This is a variable group size compute shader,
+       * compile it for the maximum possible group size.
+       */
+      max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
+   }
+   return max_work_group_size;
 }
 
-static void declare_const_and_shader_buffers(struct si_shader_context *ctx,
-                                            bool assign_params)
+static void declare_const_and_shader_buffers(struct si_shader_context *ctx, bool assign_params)
 {
-       enum ac_arg_type const_shader_buf_type;
+   enum ac_arg_type const_shader_buf_type;
 
-       if (ctx->shader->selector->info.const_buffers_declared == 1 &&
-           ctx->shader->selector->info.shader_buffers_declared == 0)
-               const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR;
-       else
-               const_shader_buf_type = AC_ARG_CONST_DESC_PTR;
+   if (ctx->shader->selector->info.const_buffers_declared == 1 &&
+       ctx->shader->selector->info.shader_buffers_declared == 0)
+      const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR;
+   else
+      const_shader_buf_type = AC_ARG_CONST_DESC_PTR;
 
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type,
-                  assign_params ? &ctx->const_and_shader_buffers :
-                  &ctx->other_const_and_shader_buffers);
+   ac_add_arg(
+      &ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type,
+      assign_params ? &ctx->const_and_shader_buffers : &ctx->other_const_and_shader_buffers);
 }
 
-static void declare_samplers_and_images(struct si_shader_context *ctx,
-                                       bool assign_params)
+static void declare_samplers_and_images(struct si_shader_context *ctx, bool assign_params)
 {
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
-                  assign_params ? &ctx->samplers_and_images :
-                  &ctx->other_samplers_and_images);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
+              assign_params ? &ctx->samplers_and_images : &ctx->other_samplers_and_images);
 }
 
-static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
-                                           bool assign_params)
+static void declare_per_stage_desc_pointers(struct si_shader_context *ctx, bool assign_params)
 {
-       declare_const_and_shader_buffers(ctx, assign_params);
-       declare_samplers_and_images(ctx, assign_params);
+   declare_const_and_shader_buffers(ctx, assign_params);
+   declare_samplers_and_images(ctx, assign_params);
 }
 
 static void declare_global_desc_pointers(struct si_shader_context *ctx)
 {
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
-                  &ctx->rw_buffers);
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
-                  &ctx->bindless_samplers_and_images);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->rw_buffers);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
+              &ctx->bindless_samplers_and_images);
 }
 
 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx)
 {
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
-       if (!ctx->shader->is_gs_copy_shader) {
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);
-       }
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
+   if (!ctx->shader->is_gs_copy_shader) {
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);
+   }
 }
 
 static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx)
 {
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers);
 
-       unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
-       if (num_vbos_in_user_sgprs) {
-               unsigned user_sgprs = ctx->args.num_sgprs_used;
+   unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
+   if (num_vbos_in_user_sgprs) {
+      unsigned user_sgprs = ctx->args.num_sgprs_used;
 
-               if (si_is_merged_shader(ctx->shader))
-                       user_sgprs -= 8;
-               assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
+      if (si_is_merged_shader(ctx->shader))
+         user_sgprs -= 8;
+      assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
 
-               /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
-               for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
-                       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
+      /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
+      for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
+         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
 
-               assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors));
-               for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
-                       ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]);
-       }
+      assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors));
+      for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
+         ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]);
+   }
 }
 
-static void declare_vs_input_vgprs(struct si_shader_context *ctx,
-                                  unsigned *num_prolog_vgprs,
-                                  bool ngg_cull_shader)
+static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_prolog_vgprs,
+                                   bool ngg_cull_shader)
 {
-       struct si_shader *shader = ctx->shader;
-
-       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id);
-       if (shader->key.as_ls) {
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->rel_auto_id);
-               if (ctx->screen->info.chip_class >= GFX10) {
-                       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
-                       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
-               } else {
-                       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
-                       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
-               }
-       } else if (ctx->screen->info.chip_class >= GFX10) {
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
-                          &ctx->vs_prim_id); /* user vgpr or PrimID (legacy) */
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
-       } else {
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vs_prim_id);
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
-       }
-
-       if (!shader->is_gs_copy_shader) {
-               if (shader->key.opt.ngg_culling && !ngg_cull_shader) {
-                       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
-                                  &ctx->ngg_old_thread_id);
-               }
-
-               /* Vertex load indices. */
-               if (shader->selector->info.num_inputs) {
-                       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
-                                  &ctx->vertex_index0);
-                       for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)
-                               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
-               }
-               *num_prolog_vgprs += shader->selector->info.num_inputs;
-       }
+   struct si_shader *shader = ctx->shader;
+
+   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id);
+   if (shader->key.as_ls) {
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->rel_auto_id);
+      if (ctx->screen->info.chip_class >= GFX10) {
+         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
+         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
+      } else {
+         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
+         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
+      }
+   } else if (ctx->screen->info.chip_class >= GFX10) {
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
+                 &ctx->vs_prim_id); /* user vgpr or PrimID (legacy) */
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
+   } else {
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vs_prim_id);
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
+   }
+
+   if (!shader->is_gs_copy_shader) {
+      if (shader->key.opt.ngg_culling && !ngg_cull_shader) {
+         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->ngg_old_thread_id);
+      }
+
+      /* Vertex load indices. */
+      if (shader->selector->info.num_inputs) {
+         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vertex_index0);
+         for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)
+            ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
+      }
+      *num_prolog_vgprs += shader->selector->info.num_inputs;
+   }
 }
 
-static void declare_vs_blit_inputs(struct si_shader_context *ctx,
-                                  unsigned vs_blit_property)
+static void declare_vs_blit_inputs(struct si_shader_context *ctx, unsigned vs_blit_property)
 {
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                  &ctx->vs_blit_inputs); /* i16 x1, y1 */
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x1, y1 */
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* depth */
-
-       if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
-       } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
-       }
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_blit_inputs); /* i16 x1, y1 */
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);                 /* i16 x1, y1 */
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL);               /* depth */
+
+   if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
+   } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
+   }
 }
 
 static void declare_tes_input_vgprs(struct si_shader_context *ctx, bool ngg_cull_shader)
 {
-       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u);
-       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v);
-       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id);
-       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id);
-
-       if (ctx->shader->key.opt.ngg_culling && !ngg_cull_shader) {
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
-                          &ctx->ngg_old_thread_id);
-       }
+   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u);
+   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v);
+   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id);
+   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id);
+
+   if (ctx->shader->key.opt.ngg_culling && !ngg_cull_shader) {
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->ngg_old_thread_id);
+   }
 }
 
-enum {
-       /* Convenient merged shader definitions. */
-       SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
-       SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
+enum
+{
+   /* Convenient merged shader definitions. */
+   SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
+   SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
 };
 
-void si_add_arg_checked(struct ac_shader_args *args,
-                       enum ac_arg_regfile file,
-                       unsigned registers, enum ac_arg_type type,
-                       struct ac_arg *arg,
-                       unsigned idx)
+void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
+                        enum ac_arg_type type, struct ac_arg *arg, unsigned idx)
 {
-       assert(args->arg_count == idx);
-       ac_add_arg(args, file, registers, type, arg);
+   assert(args->arg_count == idx);
+   ac_add_arg(args, file, registers, type, arg);
 }
 
 void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader)
 {
-       struct si_shader *shader = ctx->shader;
-       LLVMTypeRef returns[AC_MAX_ARGS];
-       unsigned i, num_return_sgprs;
-       unsigned num_returns = 0;
-       unsigned num_prolog_vgprs = 0;
-       unsigned type = ctx->type;
-       unsigned vs_blit_property =
-               shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
-
-       memset(&ctx->args, 0, sizeof(ctx->args));
-
-       /* Set MERGED shaders. */
-       if (ctx->screen->info.chip_class >= GFX9) {
-               if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
-                       type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
-               else if (shader->key.as_es || shader->key.as_ngg || type == PIPE_SHADER_GEOMETRY)
-                       type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
-       }
-
-       switch (type) {
-       case PIPE_SHADER_VERTEX:
-               declare_global_desc_pointers(ctx);
-
-               if (vs_blit_property) {
-                       declare_vs_blit_inputs(ctx, vs_blit_property);
-
-                       /* VGPRs */
-                       declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
-                       break;
-               }
-
-               declare_per_stage_desc_pointers(ctx, true);
-               declare_vs_specific_input_sgprs(ctx); 
-               if (!shader->is_gs_copy_shader)
-                       declare_vb_descriptor_input_sgprs(ctx);
-
-               if (shader->key.as_es) {
-                       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                                  &ctx->es2gs_offset);
-               } else if (shader->key.as_ls) {
-                       /* no extra parameters */
-               } else {
-                       /* The locations of the other parameters are assigned dynamically. */
-                       declare_streamout_params(ctx, &shader->selector->so);
-               }
-
-               /* VGPRs */
-               declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
-
-               /* Return values */
-               if (shader->key.opt.vs_as_prim_discard_cs) {
-                       for (i = 0; i < 4; i++)
-                               returns[num_returns++] = ctx->ac.f32; /* VGPRs */
-               }
-               break;
-
-       case PIPE_SHADER_TESS_CTRL: /* GFX6-GFX8 */
-               declare_global_desc_pointers(ctx);
-               declare_per_stage_desc_pointers(ctx, true);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
-
-               /* VGPRs */
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
-
-               /* param_tcs_offchip_offset and param_tcs_factor_offset are
-                * placed after the user SGPRs.
-                */
-               for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
-                       returns[num_returns++] = ctx->ac.i32; /* SGPRs */
-               for (i = 0; i < 11; i++)
-                       returns[num_returns++] = ctx->ac.f32; /* VGPRs */
-               break;
-
-       case SI_SHADER_MERGED_VERTEX_TESSCTRL:
-               /* Merged stages have 8 system SGPRs at the beginning. */
-               /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
-               declare_per_stage_desc_pointers(ctx,
-                                               ctx->type == PIPE_SHADER_TESS_CTRL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
-
-               declare_global_desc_pointers(ctx);
-               declare_per_stage_desc_pointers(ctx,
-                                               ctx->type == PIPE_SHADER_VERTEX);
-               declare_vs_specific_input_sgprs(ctx);
-
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
-               declare_vb_descriptor_input_sgprs(ctx);
-
-               /* VGPRs (first TCS, then VS) */
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
-
-               if (ctx->type == PIPE_SHADER_VERTEX) {
-                       declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
-
-                       /* LS return values are inputs to the TCS main shader part. */
-                       for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
-                               returns[num_returns++] = ctx->ac.i32; /* SGPRs */
-                       for (i = 0; i < 2; i++)
-                               returns[num_returns++] = ctx->ac.f32; /* VGPRs */
-               } else {
-                       /* TCS return values are inputs to the TCS epilog.
-                        *
-                        * param_tcs_offchip_offset, param_tcs_factor_offset,
-                        * param_tcs_offchip_layout, and param_rw_buffers
-                        * should be passed to the epilog.
-                        */
-                       for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
-                               returns[num_returns++] = ctx->ac.i32; /* SGPRs */
-                       for (i = 0; i < 11; i++)
-                               returns[num_returns++] = ctx->ac.f32; /* VGPRs */
-               }
-               break;
-
-       case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
-               /* Merged stages have 8 system SGPRs at the beginning. */
-               /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
-               declare_per_stage_desc_pointers(ctx,
-                                               ctx->type == PIPE_SHADER_GEOMETRY);
-
-               if (ctx->shader->key.as_ngg)
-                       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_tg_info);
-               else
-                       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
-
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
-                          &ctx->small_prim_cull_info); /* SPI_SHADER_PGM_LO_GS << 8 */
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
-
-               declare_global_desc_pointers(ctx);
-               if (ctx->type != PIPE_SHADER_VERTEX || !vs_blit_property) {
-                       declare_per_stage_desc_pointers(ctx,
-                                                       (ctx->type == PIPE_SHADER_VERTEX ||
-                                                        ctx->type == PIPE_SHADER_TESS_EVAL));
-               }
-
-               if (ctx->type == PIPE_SHADER_VERTEX) {
-                       if (vs_blit_property)
-                               declare_vs_blit_inputs(ctx, vs_blit_property);
-                       else
-                               declare_vs_specific_input_sgprs(ctx);
-               } else {
-                       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
-                       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
-                       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
-                       /* Declare as many input SGPRs as the VS has. */
-               }
-
-               if (ctx->type == PIPE_SHADER_VERTEX)
-                       declare_vb_descriptor_input_sgprs(ctx);
-
-               /* VGPRs (first GS, then VS/TES) */
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset);
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset);
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset);
-
-               if (ctx->type == PIPE_SHADER_VERTEX) {
-                       declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
-               } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
-                       declare_tes_input_vgprs(ctx, ngg_cull_shader);
-               }
-
-               if ((ctx->shader->key.as_es || ngg_cull_shader) &&
-                   (ctx->type == PIPE_SHADER_VERTEX ||
-                    ctx->type == PIPE_SHADER_TESS_EVAL)) {
-                       unsigned num_user_sgprs, num_vgprs;
-
-                       if (ctx->type == PIPE_SHADER_VERTEX) {
-                               /* For the NGG cull shader, add 1 SGPR to hold
-                                * the vertex buffer pointer.
-                                */
-                               num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + ngg_cull_shader;
-
-                               if (ngg_cull_shader && shader->selector->num_vbos_in_user_sgprs) {
-                                       assert(num_user_sgprs <= 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
-                                       num_user_sgprs = SI_SGPR_VS_VB_DESCRIPTOR_FIRST +
-                                                        shader->selector->num_vbos_in_user_sgprs * 4;
-                               }
-                       } else {
-                               num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
-                       }
-
-                       /* The NGG cull shader has to return all 9 VGPRs + the old thread ID.
-                        *
-                        * The normal merged ESGS shader only has to return the 5 VGPRs
-                        * for the GS stage.
-                        */
-                       num_vgprs = ngg_cull_shader ? 10 : 5;
-
-                       /* ES return values are inputs to GS. */
-                       for (i = 0; i < 8 + num_user_sgprs; i++)
-                               returns[num_returns++] = ctx->ac.i32; /* SGPRs */
-                       for (i = 0; i < num_vgprs; i++)
-                               returns[num_returns++] = ctx->ac.f32; /* VGPRs */
-               }
-               break;
-
-       case PIPE_SHADER_TESS_EVAL:
-               declare_global_desc_pointers(ctx);
-               declare_per_stage_desc_pointers(ctx, true);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
-
-               if (shader->key.as_es) {
-                       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
-                       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-                       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset);
-               } else {
-                       declare_streamout_params(ctx, &shader->selector->so);
-                       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
-               }
-
-               /* VGPRs */
-               declare_tes_input_vgprs(ctx, ngg_cull_shader);
-               break;
-
-       case PIPE_SHADER_GEOMETRY:
-               declare_global_desc_pointers(ctx);
-               declare_per_stage_desc_pointers(ctx, true);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_wave_id);
-
-               /* VGPRs */
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[0]);
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[1]);
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[2]);
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[3]);
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[4]);
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[5]);
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
-               break;
-
-       case PIPE_SHADER_FRAGMENT:
-               declare_global_desc_pointers(ctx);
-               declare_per_stage_desc_pointers(ctx, true);
-               si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL,
-                               SI_PARAM_ALPHA_REF);
-               si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                               &ctx->args.prim_mask, SI_PARAM_PRIM_MASK);
-
-               si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample,
-                               SI_PARAM_PERSP_SAMPLE);
-               si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
-                               &ctx->args.persp_center, SI_PARAM_PERSP_CENTER);
-               si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
-                               &ctx->args.persp_centroid, SI_PARAM_PERSP_CENTROID);
-               si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT,
-                               NULL, SI_PARAM_PERSP_PULL_MODEL);
-               si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
-                               &ctx->args.linear_sample, SI_PARAM_LINEAR_SAMPLE);
-               si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
-                               &ctx->args.linear_center, SI_PARAM_LINEAR_CENTER);
-               si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
-                               &ctx->args.linear_centroid, SI_PARAM_LINEAR_CENTROID);
-               si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT,
-                               NULL, SI_PARAM_LINE_STIPPLE_TEX);
-               si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
-                               &ctx->args.frag_pos[0], SI_PARAM_POS_X_FLOAT);
-               si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
-                               &ctx->args.frag_pos[1], SI_PARAM_POS_Y_FLOAT);
-               si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
-                               &ctx->args.frag_pos[2], SI_PARAM_POS_Z_FLOAT);
-               si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
-                               &ctx->args.frag_pos[3], SI_PARAM_POS_W_FLOAT);
-               shader->info.face_vgpr_index = ctx->args.num_vgprs_used;
-               si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
-                               &ctx->args.front_face, SI_PARAM_FRONT_FACE);
-               shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used;
-               si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
-                               &ctx->args.ancillary, SI_PARAM_ANCILLARY);
-               si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
-                               &ctx->args.sample_coverage, SI_PARAM_SAMPLE_COVERAGE);
-               si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
-                               &ctx->pos_fixed_pt, SI_PARAM_POS_FIXED_PT);
-
-               /* Color inputs from the prolog. */
-               if (shader->selector->info.colors_read) {
-                       unsigned num_color_elements =
-                               util_bitcount(shader->selector->info.colors_read);
-
-                       for (i = 0; i < num_color_elements; i++)
-                               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
-
-                       num_prolog_vgprs += num_color_elements;
-               }
-
-               /* Outputs for the epilog. */
-               num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
-               num_returns =
-                       num_return_sgprs +
-                       util_bitcount(shader->selector->info.colors_written) * 4 +
-                       shader->selector->info.writes_z +
-                       shader->selector->info.writes_stencil +
-                       shader->selector->info.writes_samplemask +
-                       1 /* SampleMaskIn */;
-
-               num_returns = MAX2(num_returns,
-                                  num_return_sgprs +
-                                  PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
-
-               for (i = 0; i < num_return_sgprs; i++)
-                       returns[i] = ctx->ac.i32;
-               for (; i < num_returns; i++)
-                       returns[i] = ctx->ac.f32;
-               break;
-
-       case PIPE_SHADER_COMPUTE:
-               declare_global_desc_pointers(ctx);
-               declare_per_stage_desc_pointers(ctx, true);
-               if (shader->selector->info.uses_grid_size)
-                       ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT,
-                                  &ctx->args.num_work_groups);
-               if (shader->selector->info.uses_block_size &&
-                   shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
-                       ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->block_size);
-
-               unsigned cs_user_data_dwords =
-                       shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD];
-               if (cs_user_data_dwords) {
-                       ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT,
-                                  &ctx->cs_user_data);
-               }
-
-               /* Hardware SGPRs. */
-               for (i = 0; i < 3; i++) {
-                       if (shader->selector->info.uses_block_id[i]) {
-                               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                                          &ctx->args.workgroup_ids[i]);
-                       }
-               }
-               if (shader->selector->info.uses_subgroup_info)
-                       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size);
-
-               /* Hardware VGPRs. */
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT,
-                          &ctx->args.local_invocation_ids);
-               break;
-       default:
-               assert(0 && "unimplemented shader");
-               return;
-       }
-
-       si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main",
-                           returns, num_returns, si_get_max_workgroup_size(shader));
-
-       /* Reserve register locations for VGPR inputs the PS prolog may need. */
-       if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
-               ac_llvm_add_target_dep_function_attr(ctx->main_fn,
-                                                    "InitialPSInputAddr",
-                                                    S_0286D0_PERSP_SAMPLE_ENA(1) |
-                                                    S_0286D0_PERSP_CENTER_ENA(1) |
-                                                    S_0286D0_PERSP_CENTROID_ENA(1) |
-                                                    S_0286D0_LINEAR_SAMPLE_ENA(1) |
-                                                    S_0286D0_LINEAR_CENTER_ENA(1) |
-                                                    S_0286D0_LINEAR_CENTROID_ENA(1) |
-                                                    S_0286D0_FRONT_FACE_ENA(1) |
-                                                    S_0286D0_ANCILLARY_ENA(1) |
-                                                    S_0286D0_POS_FIXED_PT_ENA(1));
-       }
-
-       shader->info.num_input_sgprs = ctx->args.num_sgprs_used;
-       shader->info.num_input_vgprs = ctx->args.num_vgprs_used;
-
-       assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
-       shader->info.num_input_vgprs -= num_prolog_vgprs;
-
-       if (shader->key.as_ls || ctx->type == PIPE_SHADER_TESS_CTRL) {
-               if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
-                       /* The LSHS size is not known until draw time, so we append it
-                        * at the end of whatever LDS use there may be in the rest of
-                        * the shader (currently none, unless LLVM decides to do its
-                        * own LDS-based lowering).
-                        */
-                       ctx->ac.lds = LLVMAddGlobalInAddressSpace(
-                               ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
-                               "__lds_end", AC_ADDR_SPACE_LDS);
-                       LLVMSetAlignment(ctx->ac.lds, 256);
-               } else {
-                       ac_declare_lds_as_pointer(&ctx->ac);
-               }
-       }
-
-       /* Unlike radv, we override these arguments in the prolog, so to the
-        * API shader they appear as normal arguments.
-        */
-       if (ctx->type == PIPE_SHADER_VERTEX) {
-               ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args.vertex_id);
-               ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args.instance_id);
-       } else if (ctx->type == PIPE_SHADER_FRAGMENT) {
-               ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args.persp_centroid);
-               ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args.linear_centroid);
-       }
+   struct si_shader *shader = ctx->shader;
+   LLVMTypeRef returns[AC_MAX_ARGS];
+   unsigned i, num_return_sgprs;
+   unsigned num_returns = 0;
+   unsigned num_prolog_vgprs = 0;
+   unsigned type = ctx->type;
+   unsigned vs_blit_property = shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
+
+   memset(&ctx->args, 0, sizeof(ctx->args));
+
+   /* Set MERGED shaders. */
+   if (ctx->screen->info.chip_class >= GFX9) {
+      if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
+         type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
+      else if (shader->key.as_es || shader->key.as_ngg || type == PIPE_SHADER_GEOMETRY)
+         type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
+   }
+
+   switch (type) {
+   case PIPE_SHADER_VERTEX:
+      declare_global_desc_pointers(ctx);
+
+      if (vs_blit_property) {
+         declare_vs_blit_inputs(ctx, vs_blit_property);
+
+         /* VGPRs */
+         declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
+         break;
+      }
+
+      declare_per_stage_desc_pointers(ctx, true);
+      declare_vs_specific_input_sgprs(ctx);
+      if (!shader->is_gs_copy_shader)
+         declare_vb_descriptor_input_sgprs(ctx);
+
+      if (shader->key.as_es) {
+         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset);
+      } else if (shader->key.as_ls) {
+         /* no extra parameters */
+      } else {
+         /* The locations of the other parameters are assigned dynamically. */
+         declare_streamout_params(ctx, &shader->selector->so);
+      }
+
+      /* VGPRs */
+      declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
+
+      /* Return values */
+      if (shader->key.opt.vs_as_prim_discard_cs) {
+         for (i = 0; i < 4; i++)
+            returns[num_returns++] = ctx->ac.f32; /* VGPRs */
+      }
+      break;
+
+   case PIPE_SHADER_TESS_CTRL: /* GFX6-GFX8 */
+      declare_global_desc_pointers(ctx);
+      declare_per_stage_desc_pointers(ctx, true);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
+
+      /* VGPRs */
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
+
+      /* param_tcs_offchip_offset and param_tcs_factor_offset are
+       * placed after the user SGPRs.
+       */
+      for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
+         returns[num_returns++] = ctx->ac.i32; /* SGPRs */
+      for (i = 0; i < 11; i++)
+         returns[num_returns++] = ctx->ac.f32; /* VGPRs */
+      break;
+
+   case SI_SHADER_MERGED_VERTEX_TESSCTRL:
+      /* Merged stages have 8 system SGPRs at the beginning. */
+      /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
+      declare_per_stage_desc_pointers(ctx, ctx->type == PIPE_SHADER_TESS_CTRL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
+
+      declare_global_desc_pointers(ctx);
+      declare_per_stage_desc_pointers(ctx, ctx->type == PIPE_SHADER_VERTEX);
+      declare_vs_specific_input_sgprs(ctx);
+
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
+      declare_vb_descriptor_input_sgprs(ctx);
+
+      /* VGPRs (first TCS, then VS) */
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
+
+      if (ctx->type == PIPE_SHADER_VERTEX) {
+         declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
+
+         /* LS return values are inputs to the TCS main shader part. */
+         for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
+            returns[num_returns++] = ctx->ac.i32; /* SGPRs */
+         for (i = 0; i < 2; i++)
+            returns[num_returns++] = ctx->ac.f32; /* VGPRs */
+      } else {
+         /* TCS return values are inputs to the TCS epilog.
+          *
+          * param_tcs_offchip_offset, param_tcs_factor_offset,
+          * param_tcs_offchip_layout, and param_rw_buffers
+          * should be passed to the epilog.
+          */
+         for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
+            returns[num_returns++] = ctx->ac.i32; /* SGPRs */
+         for (i = 0; i < 11; i++)
+            returns[num_returns++] = ctx->ac.f32; /* VGPRs */
+      }
+      break;
+
+   case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
+      /* Merged stages have 8 system SGPRs at the beginning. */
+      /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
+      declare_per_stage_desc_pointers(ctx, ctx->type == PIPE_SHADER_GEOMETRY);
+
+      if (ctx->shader->key.as_ngg)
+         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_tg_info);
+      else
+         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
+
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
+                 &ctx->small_prim_cull_info); /* SPI_SHADER_PGM_LO_GS << 8 */
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
+                 NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
+
+      declare_global_desc_pointers(ctx);
+      if (ctx->type != PIPE_SHADER_VERTEX || !vs_blit_property) {
+         declare_per_stage_desc_pointers(
+            ctx, (ctx->type == PIPE_SHADER_VERTEX || ctx->type == PIPE_SHADER_TESS_EVAL));
+      }
+
+      if (ctx->type == PIPE_SHADER_VERTEX) {
+         if (vs_blit_property)
+            declare_vs_blit_inputs(ctx, vs_blit_property);
+         else
+            declare_vs_specific_input_sgprs(ctx);
+      } else {
+         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
+         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
+         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
+         /* Declare as many input SGPRs as the VS has. */
+      }
+
+      if (ctx->type == PIPE_SHADER_VERTEX)
+         declare_vb_descriptor_input_sgprs(ctx);
+
+      /* VGPRs (first GS, then VS/TES) */
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset);
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset);
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset);
+
+      if (ctx->type == PIPE_SHADER_VERTEX) {
+         declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
+      } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
+         declare_tes_input_vgprs(ctx, ngg_cull_shader);
+      }
+
+      if ((ctx->shader->key.as_es || ngg_cull_shader) &&
+          (ctx->type == PIPE_SHADER_VERTEX || ctx->type == PIPE_SHADER_TESS_EVAL)) {
+         unsigned num_user_sgprs, num_vgprs;
+
+         if (ctx->type == PIPE_SHADER_VERTEX) {
+            /* For the NGG cull shader, add 1 SGPR to hold
+             * the vertex buffer pointer.
+             */
+            num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + ngg_cull_shader;
+
+            if (ngg_cull_shader && shader->selector->num_vbos_in_user_sgprs) {
+               assert(num_user_sgprs <= 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
+               num_user_sgprs =
+                  SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4;
+            }
+         } else {
+            num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
+         }
+
+         /* The NGG cull shader has to return all 9 VGPRs + the old thread ID.
+          *
+          * The normal merged ESGS shader only has to return the 5 VGPRs
+          * for the GS stage.
+          */
+         num_vgprs = ngg_cull_shader ? 10 : 5;
+
+         /* ES return values are inputs to GS. */
+         for (i = 0; i < 8 + num_user_sgprs; i++)
+            returns[num_returns++] = ctx->ac.i32; /* SGPRs */
+         for (i = 0; i < num_vgprs; i++)
+            returns[num_returns++] = ctx->ac.f32; /* VGPRs */
+      }
+      break;
+
+   case PIPE_SHADER_TESS_EVAL:
+      declare_global_desc_pointers(ctx);
+      declare_per_stage_desc_pointers(ctx, true);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
+
+      if (shader->key.as_es) {
+         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
+         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset);
+      } else {
+         declare_streamout_params(ctx, &shader->selector->so);
+         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
+      }
+
+      /* VGPRs */
+      declare_tes_input_vgprs(ctx, ngg_cull_shader);
+      break;
+
+   case PIPE_SHADER_GEOMETRY:
+      declare_global_desc_pointers(ctx);
+      declare_per_stage_desc_pointers(ctx, true);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_wave_id);
+
+      /* VGPRs */
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[0]);
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[1]);
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[2]);
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[3]);
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[4]);
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[5]);
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
+      break;
+
+   case PIPE_SHADER_FRAGMENT:
+      declare_global_desc_pointers(ctx);
+      declare_per_stage_desc_pointers(ctx, true);
+      si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL, SI_PARAM_ALPHA_REF);
+      si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.prim_mask,
+                         SI_PARAM_PRIM_MASK);
+
+      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample,
+                         SI_PARAM_PERSP_SAMPLE);
+      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_center,
+                         SI_PARAM_PERSP_CENTER);
+      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_centroid,
+                         SI_PARAM_PERSP_CENTROID);
+      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL);
+      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_sample,
+                         SI_PARAM_LINEAR_SAMPLE);
+      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_center,
+                         SI_PARAM_LINEAR_CENTER);
+      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_centroid,
+                         SI_PARAM_LINEAR_CENTROID);
+      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX);
+      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[0],
+                         SI_PARAM_POS_X_FLOAT);
+      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[1],
+                         SI_PARAM_POS_Y_FLOAT);
+      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[2],
+                         SI_PARAM_POS_Z_FLOAT);
+      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[3],
+                         SI_PARAM_POS_W_FLOAT);
+      shader->info.face_vgpr_index = ctx->args.num_vgprs_used;
+      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.front_face,
+                         SI_PARAM_FRONT_FACE);
+      shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used;
+      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.ancillary,
+                         SI_PARAM_ANCILLARY);
+      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.sample_coverage,
+                         SI_PARAM_SAMPLE_COVERAGE);
+      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->pos_fixed_pt,
+                         SI_PARAM_POS_FIXED_PT);
+
+      /* Color inputs from the prolog. */
+      if (shader->selector->info.colors_read) {
+         unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read);
+
+         for (i = 0; i < num_color_elements; i++)
+            ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
+
+         num_prolog_vgprs += num_color_elements;
+      }
+
+      /* Outputs for the epilog. */
+      num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
+      num_returns = num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 +
+                    shader->selector->info.writes_z + shader->selector->info.writes_stencil +
+                    shader->selector->info.writes_samplemask + 1 /* SampleMaskIn */;
+
+      num_returns = MAX2(num_returns, num_return_sgprs + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
+
+      for (i = 0; i < num_return_sgprs; i++)
+         returns[i] = ctx->ac.i32;
+      for (; i < num_returns; i++)
+         returns[i] = ctx->ac.f32;
+      break;
+
+   case PIPE_SHADER_COMPUTE:
+      declare_global_desc_pointers(ctx);
+      declare_per_stage_desc_pointers(ctx, true);
+      if (shader->selector->info.uses_grid_size)
+         ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->args.num_work_groups);
+      if (shader->selector->info.uses_block_size &&
+          shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
+         ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->block_size);
+
+      unsigned cs_user_data_dwords =
+         shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD];
+      if (cs_user_data_dwords) {
+         ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, &ctx->cs_user_data);
+      }
+
+      /* Hardware SGPRs. */
+      for (i = 0; i < 3; i++) {
+         if (shader->selector->info.uses_block_id[i]) {
+            ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.workgroup_ids[i]);
+         }
+      }
+      if (shader->selector->info.uses_subgroup_info)
+         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size);
+
+      /* Hardware VGPRs. */
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, &ctx->args.local_invocation_ids);
+      break;
+   default:
+      assert(0 && "unimplemented shader");
+      return;
+   }
+
+   si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main", returns, num_returns,
+                       si_get_max_workgroup_size(shader));
+
+   /* Reserve register locations for VGPR inputs the PS prolog may need. */
+   if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
+      ac_llvm_add_target_dep_function_attr(
+         ctx->main_fn, "InitialPSInputAddr",
+         S_0286D0_PERSP_SAMPLE_ENA(1) | S_0286D0_PERSP_CENTER_ENA(1) |
+            S_0286D0_PERSP_CENTROID_ENA(1) | S_0286D0_LINEAR_SAMPLE_ENA(1) |
+            S_0286D0_LINEAR_CENTER_ENA(1) | S_0286D0_LINEAR_CENTROID_ENA(1) |
+            S_0286D0_FRONT_FACE_ENA(1) | S_0286D0_ANCILLARY_ENA(1) | S_0286D0_POS_FIXED_PT_ENA(1));
+   }
+
+   shader->info.num_input_sgprs = ctx->args.num_sgprs_used;
+   shader->info.num_input_vgprs = ctx->args.num_vgprs_used;
+
+   assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
+   shader->info.num_input_vgprs -= num_prolog_vgprs;
+
+   if (shader->key.as_ls || ctx->type == PIPE_SHADER_TESS_CTRL) {
+      if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
+         /* The LSHS size is not known until draw time, so we append it
+          * at the end of whatever LDS use there may be in the rest of
+          * the shader (currently none, unless LLVM decides to do its
+          * own LDS-based lowering).
+          */
+         ctx->ac.lds = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
+                                                   "__lds_end", AC_ADDR_SPACE_LDS);
+         LLVMSetAlignment(ctx->ac.lds, 256);
+      } else {
+         ac_declare_lds_as_pointer(&ctx->ac);
+      }
+   }
+
+   /* Unlike radv, we override these arguments in the prolog, so to the
+    * API shader they appear as normal arguments.
+    */
+   if (ctx->type == PIPE_SHADER_VERTEX) {
+      ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args.vertex_id);
+      ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args.instance_id);
+   } else if (ctx->type == PIPE_SHADER_FRAGMENT) {
+      ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args.persp_centroid);
+      ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args.linear_centroid);
+   }
 }
 
 /* For the UMR disassembler. */
-#define DEBUGGER_END_OF_CODE_MARKER    0xbf9f0000 /* invalid instruction */
-#define DEBUGGER_NUM_MARKERS           5
+#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
+#define DEBUGGER_NUM_MARKERS        5
 
-static bool si_shader_binary_open(struct si_screen *screen,
-                                 struct si_shader *shader,
-                                 struct ac_rtld_binary *rtld)
+static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
+                                  struct ac_rtld_binary *rtld)
 {
-       const struct si_shader_selector *sel = shader->selector;
-       const char *part_elfs[5];
-       size_t part_sizes[5];
-       unsigned num_parts = 0;
-
-#define add_part(shader_or_part) \
-       if (shader_or_part) { \
-               part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \
-               part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \
-               num_parts++; \
-       }
-
-       add_part(shader->prolog);
-       add_part(shader->previous_stage);
-       add_part(shader->prolog2);
-       add_part(shader);
-       add_part(shader->epilog);
+   const struct si_shader_selector *sel = shader->selector;
+   const char *part_elfs[5];
+   size_t part_sizes[5];
+   unsigned num_parts = 0;
+
+#define add_part(shader_or_part)                                                                   \
+   if (shader_or_part) {                                                                           \
+      part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer;                                  \
+      part_sizes[num_parts] = (shader_or_part)->binary.elf_size;                                   \
+      num_parts++;                                                                                 \
+   }
+
+   add_part(shader->prolog);
+   add_part(shader->previous_stage);
+   add_part(shader->prolog2);
+   add_part(shader);
+   add_part(shader->epilog);
 
 #undef add_part
 
-       struct ac_rtld_symbol lds_symbols[2];
-       unsigned num_lds_symbols = 0;
-
-       if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader &&
-           (sel->type == PIPE_SHADER_GEOMETRY || shader->key.as_ngg)) {
-               /* We add this symbol even on LLVM <= 8 to ensure that
-                * shader->config.lds_size is set correctly below.
-                */
-               struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
-               sym->name = "esgs_ring";
-               sym->size = shader->gs_info.esgs_ring_size;
-               sym->align = 64 * 1024;
-       }
-
-       if (shader->key.as_ngg && sel->type == PIPE_SHADER_GEOMETRY) {
-               struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
-               sym->name = "ngg_emit";
-               sym->size = shader->ngg.ngg_emit_size * 4;
-               sym->align = 4;
-       }
-
-       bool ok = ac_rtld_open(rtld, (struct ac_rtld_open_info){
-                       .info = &screen->info,
-                       .options = {
-                               .halt_at_entry = screen->options.halt_shaders,
-                       },
-                       .shader_type = tgsi_processor_to_shader_stage(sel->type),
-                       .wave_size = si_get_shader_wave_size(shader),
-                       .num_parts = num_parts,
-                       .elf_ptrs = part_elfs,
-                       .elf_sizes = part_sizes,
-                       .num_shared_lds_symbols = num_lds_symbols,
-                       .shared_lds_symbols = lds_symbols });
-
-       if (rtld->lds_size > 0) {
-               unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256;
-               shader->config.lds_size =
-                       align(rtld->lds_size, alloc_granularity) / alloc_granularity;
-       }
-
-       return ok;
+   struct ac_rtld_symbol lds_symbols[2];
+   unsigned num_lds_symbols = 0;
+
+   if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader &&
+       (sel->type == PIPE_SHADER_GEOMETRY || shader->key.as_ngg)) {
+      /* We add this symbol even on LLVM <= 8 to ensure that
+       * shader->config.lds_size is set correctly below.
+       */
+      struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
+      sym->name = "esgs_ring";
+      sym->size = shader->gs_info.esgs_ring_size;
+      sym->align = 64 * 1024;
+   }
+
+   if (shader->key.as_ngg && sel->type == PIPE_SHADER_GEOMETRY) {
+      struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
+      sym->name = "ngg_emit";
+      sym->size = shader->ngg.ngg_emit_size * 4;
+      sym->align = 4;
+   }
+
+   bool ok = ac_rtld_open(
+      rtld, (struct ac_rtld_open_info){.info = &screen->info,
+                                       .options =
+                                          {
+                                             .halt_at_entry = screen->options.halt_shaders,
+                                          },
+                                       .shader_type = tgsi_processor_to_shader_stage(sel->type),
+                                       .wave_size = si_get_shader_wave_size(shader),
+                                       .num_parts = num_parts,
+                                       .elf_ptrs = part_elfs,
+                                       .elf_sizes = part_sizes,
+                                       .num_shared_lds_symbols = num_lds_symbols,
+                                       .shared_lds_symbols = lds_symbols});
+
+   if (rtld->lds_size > 0) {
+      unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256;
+      shader->config.lds_size = align(rtld->lds_size, alloc_granularity) / alloc_granularity;
+   }
+
+   return ok;
 }
 
 static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
 {
-       struct ac_rtld_binary rtld;
-       si_shader_binary_open(screen, shader, &rtld);
-       return rtld.exec_size;
+   struct ac_rtld_binary rtld;
+   si_shader_binary_open(screen, shader, &rtld);
+   return rtld.exec_size;
 }
 
 static bool si_get_external_symbol(void *data, const char *name, uint64_t *value)
 {
-       uint64_t *scratch_va = data;
-
-       if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
-               *value = (uint32_t)*scratch_va;
-               return true;
-       }
-       if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
-               /* Enable scratch coalescing. */
-               *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) |
-                        S_008F04_SWIZZLE_ENABLE(1);
-               return true;
-       }
-
-       return false;
+   uint64_t *scratch_va = data;
+
+   if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
+      *value = (uint32_t)*scratch_va;
+      return true;
+   }
+   if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
+      /* Enable scratch coalescing. */
+      *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) | S_008F04_SWIZZLE_ENABLE(1);
+      return true;
+   }
+
+   return false;
 }
 
 bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
-                            uint64_t scratch_va)
+                             uint64_t scratch_va)
 {
-       struct ac_rtld_binary binary;
-       if (!si_shader_binary_open(sscreen, shader, &binary))
-               return false;
-
-       si_resource_reference(&shader->bo, NULL);
-       shader->bo = si_aligned_buffer_create(&sscreen->b,
-                                             sscreen->info.cpdma_prefetch_writes_memory ?
-                                               0 : SI_RESOURCE_FLAG_READ_ONLY,
-                                              PIPE_USAGE_IMMUTABLE,
-                                              align(binary.rx_size, SI_CPDMA_ALIGNMENT),
-                                              256);
-       if (!shader->bo)
-               return false;
-
-       /* Upload. */
-       struct ac_rtld_upload_info u = {};
-       u.binary = &binary;
-       u.get_external_symbol = si_get_external_symbol;
-       u.cb_data = &scratch_va;
-       u.rx_va = shader->bo->gpu_address;
-       u.rx_ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL,
-                                       PIPE_TRANSFER_READ_WRITE |
-                                       PIPE_TRANSFER_UNSYNCHRONIZED |
-                                       RADEON_TRANSFER_TEMPORARY);
-       if (!u.rx_ptr)
-               return false;
-
-       bool ok = ac_rtld_upload(&u);
-
-       sscreen->ws->buffer_unmap(shader->bo->buf);
-       ac_rtld_close(&binary);
-
-       return ok;
+   struct ac_rtld_binary binary;
+   if (!si_shader_binary_open(sscreen, shader, &binary))
+      return false;
+
+   si_resource_reference(&shader->bo, NULL);
+   shader->bo = si_aligned_buffer_create(
+      &sscreen->b, sscreen->info.cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY,
+      PIPE_USAGE_IMMUTABLE, align(binary.rx_size, SI_CPDMA_ALIGNMENT), 256);
+   if (!shader->bo)
+      return false;
+
+   /* Upload. */
+   struct ac_rtld_upload_info u = {};
+   u.binary = &binary;
+   u.get_external_symbol = si_get_external_symbol;
+   u.cb_data = &scratch_va;
+   u.rx_va = shader->bo->gpu_address;
+   u.rx_ptr = sscreen->ws->buffer_map(
+      shader->bo->buf, NULL,
+      PIPE_TRANSFER_READ_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED | RADEON_TRANSFER_TEMPORARY);
+   if (!u.rx_ptr)
+      return false;
+
+   bool ok = ac_rtld_upload(&u);
+
+   sscreen->ws->buffer_unmap(shader->bo->buf);
+   ac_rtld_close(&binary);
+
+   return ok;
 }
 
 static void si_shader_dump_disassembly(struct si_screen *screen,
-                                      const struct si_shader_binary *binary,
-                                      enum pipe_shader_type shader_type,
-                                      unsigned wave_size,
-                                      struct pipe_debug_callback *debug,
-                                      const char *name, FILE *file)
+                                       const struct si_shader_binary *binary,
+                                       enum pipe_shader_type shader_type, unsigned wave_size,
+                                       struct pipe_debug_callback *debug, const char *name,
+                                       FILE *file)
 {
-       struct ac_rtld_binary rtld_binary;
-
-       if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
-                       .info = &screen->info,
-                       .shader_type = tgsi_processor_to_shader_stage(shader_type),
-                       .wave_size = wave_size,
-                       .num_parts = 1,
-                       .elf_ptrs = &binary->elf_buffer,
-                       .elf_sizes = &binary->elf_size }))
-               return;
-
-       const char *disasm;
-       size_t nbytes;
-
-       if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
-               goto out;
-
-       if (nbytes > INT_MAX)
-               goto out;
-
-       if (debug && debug->debug_message) {
-               /* Very long debug messages are cut off, so send the
-                * disassembly one line at a time. This causes more
-                * overhead, but on the plus side it simplifies
-                * parsing of resulting logs.
-                */
-               pipe_debug_message(debug, SHADER_INFO,
-                                  "Shader Disassembly Begin");
-
-               uint64_t line = 0;
-               while (line < nbytes) {
-                       int count = nbytes - line;
-                       const char *nl = memchr(disasm + line, '\n', nbytes - line);
-                       if (nl)
-                               count = nl - (disasm + line);
-
-                       if (count) {
-                               pipe_debug_message(debug, SHADER_INFO,
-                                                  "%.*s", count, disasm + line);
-                       }
-
-                       line += count + 1;
-               }
-
-               pipe_debug_message(debug, SHADER_INFO,
-                                  "Shader Disassembly End");
-       }
-
-       if (file) {
-               fprintf(file, "Shader %s disassembly:\n", name);
-               fprintf(file, "%*s", (int)nbytes, disasm);
-       }
+   struct ac_rtld_binary rtld_binary;
+
+   if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
+                                      .info = &screen->info,
+                                      .shader_type = tgsi_processor_to_shader_stage(shader_type),
+                                      .wave_size = wave_size,
+                                      .num_parts = 1,
+                                      .elf_ptrs = &binary->elf_buffer,
+                                      .elf_sizes = &binary->elf_size}))
+      return;
+
+   const char *disasm;
+   size_t nbytes;
+
+   if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
+      goto out;
+
+   if (nbytes > INT_MAX)
+      goto out;
+
+   if (debug && debug->debug_message) {
+      /* Very long debug messages are cut off, so send the
+       * disassembly one line at a time. This causes more
+       * overhead, but on the plus side it simplifies
+       * parsing of resulting logs.
+       */
+      pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin");
+
+      uint64_t line = 0;
+      while (line < nbytes) {
+         int count = nbytes - line;
+         const char *nl = memchr(disasm + line, '\n', nbytes - line);
+         if (nl)
+            count = nl - (disasm + line);
+
+         if (count) {
+            pipe_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line);
+         }
+
+         line += count + 1;
+      }
+
+      pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly End");
+   }
+
+   if (file) {
+      fprintf(file, "Shader %s disassembly:\n", name);
+      fprintf(file, "%*s", (int)nbytes, disasm);
+   }
 
 out:
-       ac_rtld_close(&rtld_binary);
+   ac_rtld_close(&rtld_binary);
 }
 
 static void si_calculate_max_simd_waves(struct si_shader *shader)
 {
-       struct si_screen *sscreen = shader->selector->screen;
-       struct ac_shader_config *conf = &shader->config;
-       unsigned num_inputs = shader->selector->info.num_inputs;
-       unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256;
-       unsigned lds_per_wave = 0;
-       unsigned max_simd_waves;
-
-       max_simd_waves = sscreen->info.max_wave64_per_simd;
-
-       /* Compute LDS usage for PS. */
-       switch (shader->selector->type) {
-       case PIPE_SHADER_FRAGMENT:
-               /* The minimum usage per wave is (num_inputs * 48). The maximum
-                * usage is (num_inputs * 48 * 16).
-                * We can get anything in between and it varies between waves.
-                *
-                * The 48 bytes per input for a single primitive is equal to
-                * 4 bytes/component * 4 components/input * 3 points.
-                *
-                * Other stages don't know the size at compile time or don't
-                * allocate LDS per wave, but instead they do it per thread group.
-                */
-               lds_per_wave = conf->lds_size * lds_increment +
-                              align(num_inputs * 48, lds_increment);
-               break;
-       case PIPE_SHADER_COMPUTE:
-               if (shader->selector) {
-                       unsigned max_workgroup_size =
-                               si_get_max_workgroup_size(shader);
-                       lds_per_wave = (conf->lds_size * lds_increment) /
-                                      DIV_ROUND_UP(max_workgroup_size,
-                                                   sscreen->compute_wave_size);
-               }
-               break;
-       default:;
-       }
-
-       /* Compute the per-SIMD wave counts. */
-       if (conf->num_sgprs) {
-               max_simd_waves =
-                       MIN2(max_simd_waves,
-                            sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
-       }
-
-       if (conf->num_vgprs) {
-               /* Always print wave limits as Wave64, so that we can compare
-                * Wave32 and Wave64 with shader-db fairly. */
-               unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
-               max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs);
-       }
-
-       unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
-       if (lds_per_wave)
-               max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);
-
-       shader->info.max_simd_waves = max_simd_waves;
+   struct si_screen *sscreen = shader->selector->screen;
+   struct ac_shader_config *conf = &shader->config;
+   unsigned num_inputs = shader->selector->info.num_inputs;
+   unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256;
+   unsigned lds_per_wave = 0;
+   unsigned max_simd_waves;
+
+   max_simd_waves = sscreen->info.max_wave64_per_simd;
+
+   /* Compute LDS usage for PS. */
+   switch (shader->selector->type) {
+   case PIPE_SHADER_FRAGMENT:
+      /* The minimum usage per wave is (num_inputs * 48). The maximum
+       * usage is (num_inputs * 48 * 16).
+       * We can get anything in between and it varies between waves.
+       *
+       * The 48 bytes per input for a single primitive is equal to
+       * 4 bytes/component * 4 components/input * 3 points.
+       *
+       * Other stages don't know the size at compile time or don't
+       * allocate LDS per wave, but instead they do it per thread group.
+       */
+      lds_per_wave = conf->lds_size * lds_increment + align(num_inputs * 48, lds_increment);
+      break;
+   case PIPE_SHADER_COMPUTE:
+      if (shader->selector) {
+         unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
+         lds_per_wave = (conf->lds_size * lds_increment) /
+                        DIV_ROUND_UP(max_workgroup_size, sscreen->compute_wave_size);
+      }
+      break;
+   default:;
+   }
+
+   /* Compute the per-SIMD wave counts. */
+   if (conf->num_sgprs) {
+      max_simd_waves =
+         MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
+   }
+
+   if (conf->num_vgprs) {
+      /* Always print wave limits as Wave64, so that we can compare
+       * Wave32 and Wave64 with shader-db fairly. */
+      unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
+      max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs);
+   }
+
+   unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
+   if (lds_per_wave)
+      max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);
+
+   shader->info.max_simd_waves = max_simd_waves;
 }
 
-void si_shader_dump_stats_for_shader_db(struct si_screen *screen,
-                                       struct si_shader *shader,
-                                       struct pipe_debug_callback *debug)
+void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
+                                        struct pipe_debug_callback *debug)
 {
-       const struct ac_shader_config *conf = &shader->config;
-
-       if (screen->options.debug_disassembly)
-               si_shader_dump_disassembly(screen, &shader->binary,
-                                          shader->selector->type,
-                                          si_get_shader_wave_size(shader),
-                                          debug, "main", NULL);
-
-       pipe_debug_message(debug, SHADER_INFO,
-                          "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
-                          "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
-                          "Spilled VGPRs: %d PrivMem VGPRs: %d",
-                          conf->num_sgprs, conf->num_vgprs,
-                          si_get_shader_binary_size(screen, shader),
-                          conf->lds_size, conf->scratch_bytes_per_wave,
-                          shader->info.max_simd_waves, conf->spilled_sgprs,
-                          conf->spilled_vgprs, shader->info.private_mem_vgprs);
+   const struct ac_shader_config *conf = &shader->config;
+
+   if (screen->options.debug_disassembly)
+      si_shader_dump_disassembly(screen, &shader->binary, shader->selector->type,
+                                 si_get_shader_wave_size(shader), debug, "main", NULL);
+
+   pipe_debug_message(debug, SHADER_INFO,
+                      "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
+                      "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
+                      "Spilled VGPRs: %d PrivMem VGPRs: %d",
+                      conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
+                      conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
+                      conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs);
 }
 
-static void si_shader_dump_stats(struct si_screen *sscreen,
-                                struct si_shader *shader,
-                                FILE *file,
-                                bool check_debug_option)
+static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file,
+                                 bool check_debug_option)
 {
-       const struct ac_shader_config *conf = &shader->config;
-
-       if (!check_debug_option ||
-           si_can_dump_shader(sscreen, shader->selector->type)) {
-               if (shader->selector->type == PIPE_SHADER_FRAGMENT) {
-                       fprintf(file, "*** SHADER CONFIG ***\n"
-                               "SPI_PS_INPUT_ADDR = 0x%04x\n"
-                               "SPI_PS_INPUT_ENA  = 0x%04x\n",
-                               conf->spi_ps_input_addr, conf->spi_ps_input_ena);
-               }
-
-               fprintf(file, "*** SHADER STATS ***\n"
-                       "SGPRS: %d\n"
-                       "VGPRS: %d\n"
-                       "Spilled SGPRs: %d\n"
-                       "Spilled VGPRs: %d\n"
-                       "Private memory VGPRs: %d\n"
-                       "Code Size: %d bytes\n"
-                       "LDS: %d blocks\n"
-                       "Scratch: %d bytes per wave\n"
-                       "Max Waves: %d\n"
-                       "********************\n\n\n",
-                       conf->num_sgprs, conf->num_vgprs,
-                       conf->spilled_sgprs, conf->spilled_vgprs,
-                       shader->info.private_mem_vgprs,
-                       si_get_shader_binary_size(sscreen, shader),
-                       conf->lds_size, conf->scratch_bytes_per_wave,
-                       shader->info.max_simd_waves);
-       }
+   const struct ac_shader_config *conf = &shader->config;
+
+   if (!check_debug_option || si_can_dump_shader(sscreen, shader->selector->type)) {
+      if (shader->selector->type == PIPE_SHADER_FRAGMENT) {
+         fprintf(file,
+                 "*** SHADER CONFIG ***\n"
+                 "SPI_PS_INPUT_ADDR = 0x%04x\n"
+                 "SPI_PS_INPUT_ENA  = 0x%04x\n",
+                 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
+      }
+
+      fprintf(file,
+              "*** SHADER STATS ***\n"
+              "SGPRS: %d\n"
+              "VGPRS: %d\n"
+              "Spilled SGPRs: %d\n"
+              "Spilled VGPRs: %d\n"
+              "Private memory VGPRs: %d\n"
+              "Code Size: %d bytes\n"
+              "LDS: %d blocks\n"
+              "Scratch: %d bytes per wave\n"
+              "Max Waves: %d\n"
+              "********************\n\n\n",
+              conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs,
+              shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader),
+              conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
+   }
 }
 
 const char *si_get_shader_name(const struct si_shader *shader)
 {
-       switch (shader->selector->type) {
-       case PIPE_SHADER_VERTEX:
-               if (shader->key.as_es)
-                       return "Vertex Shader as ES";
-               else if (shader->key.as_ls)
-                       return "Vertex Shader as LS";
-               else if (shader->key.opt.vs_as_prim_discard_cs)
-                       return "Vertex Shader as Primitive Discard CS";
-               else if (shader->key.as_ngg)
-                       return "Vertex Shader as ESGS";
-               else
-                       return "Vertex Shader as VS";
-       case PIPE_SHADER_TESS_CTRL:
-               return "Tessellation Control Shader";
-       case PIPE_SHADER_TESS_EVAL:
-               if (shader->key.as_es)
-                       return "Tessellation Evaluation Shader as ES";
-               else if (shader->key.as_ngg)
-                       return "Tessellation Evaluation Shader as ESGS";
-               else
-                       return "Tessellation Evaluation Shader as VS";
-       case PIPE_SHADER_GEOMETRY:
-               if (shader->is_gs_copy_shader)
-                       return "GS Copy Shader as VS";
-               else
-                       return "Geometry Shader";
-       case PIPE_SHADER_FRAGMENT:
-               return "Pixel Shader";
-       case PIPE_SHADER_COMPUTE:
-               return "Compute Shader";
-       default:
-               return "Unknown Shader";
-       }
+   switch (shader->selector->type) {
+   case PIPE_SHADER_VERTEX:
+      if (shader->key.as_es)
+         return "Vertex Shader as ES";
+      else if (shader->key.as_ls)
+         return "Vertex Shader as LS";
+      else if (shader->key.opt.vs_as_prim_discard_cs)
+         return "Vertex Shader as Primitive Discard CS";
+      else if (shader->key.as_ngg)
+         return "Vertex Shader as ESGS";
+      else
+         return "Vertex Shader as VS";
+   case PIPE_SHADER_TESS_CTRL:
+      return "Tessellation Control Shader";
+   case PIPE_SHADER_TESS_EVAL:
+      if (shader->key.as_es)
+         return "Tessellation Evaluation Shader as ES";
+      else if (shader->key.as_ngg)
+         return "Tessellation Evaluation Shader as ESGS";
+      else
+         return "Tessellation Evaluation Shader as VS";
+   case PIPE_SHADER_GEOMETRY:
+      if (shader->is_gs_copy_shader)
+         return "GS Copy Shader as VS";
+      else
+         return "Geometry Shader";
+   case PIPE_SHADER_FRAGMENT:
+      return "Pixel Shader";
+   case PIPE_SHADER_COMPUTE:
+      return "Compute Shader";
+   default:
+      return "Unknown Shader";
+   }
 }
 
 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
-                   struct pipe_debug_callback *debug,
-                   FILE *file, bool check_debug_option)
+                    struct pipe_debug_callback *debug, FILE *file, bool check_debug_option)
 {
-       enum pipe_shader_type shader_type = shader->selector->type;
-
-       if (!check_debug_option ||
-           si_can_dump_shader(sscreen, shader_type))
-               si_dump_shader_key(shader, file);
-
-       if (!check_debug_option && shader->binary.llvm_ir_string) {
-               if (shader->previous_stage &&
-                   shader->previous_stage->binary.llvm_ir_string) {
-                       fprintf(file, "\n%s - previous stage - LLVM IR:\n\n",
-                               si_get_shader_name(shader));
-                       fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
-               }
-
-               fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
-                       si_get_shader_name(shader));
-               fprintf(file, "%s\n", shader->binary.llvm_ir_string);
-       }
-
-       if (!check_debug_option ||
-           (si_can_dump_shader(sscreen, shader_type) &&
-            !(sscreen->debug_flags & DBG(NO_ASM)))) {
-               unsigned wave_size = si_get_shader_wave_size(shader);
-
-               fprintf(file, "\n%s:\n", si_get_shader_name(shader));
-
-               if (shader->prolog)
-                       si_shader_dump_disassembly(sscreen, &shader->prolog->binary,
-                                                  shader_type, wave_size, debug, "prolog", file);
-               if (shader->previous_stage)
-                       si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary,
-                                                  shader_type, wave_size, debug, "previous stage", file);
-               if (shader->prolog2)
-                       si_shader_dump_disassembly(sscreen, &shader->prolog2->binary,
-                                                  shader_type, wave_size, debug, "prolog2", file);
-
-               si_shader_dump_disassembly(sscreen, &shader->binary, shader_type,
-                                          wave_size, debug, "main", file);
-
-               if (shader->epilog)
-                       si_shader_dump_disassembly(sscreen, &shader->epilog->binary,
-                                                  shader_type, wave_size, debug, "epilog", file);
-               fprintf(file, "\n");
-       }
-
-       si_shader_dump_stats(sscreen, shader, file, check_debug_option);
+   enum pipe_shader_type shader_type = shader->selector->type;
+
+   if (!check_debug_option || si_can_dump_shader(sscreen, shader_type))
+      si_dump_shader_key(shader, file);
+
+   if (!check_debug_option && shader->binary.llvm_ir_string) {
+      if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) {
+         fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader));
+         fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
+      }
+
+      fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader));
+      fprintf(file, "%s\n", shader->binary.llvm_ir_string);
+   }
+
+   if (!check_debug_option ||
+       (si_can_dump_shader(sscreen, shader_type) && !(sscreen->debug_flags & DBG(NO_ASM)))) {
+      unsigned wave_size = si_get_shader_wave_size(shader);
+
+      fprintf(file, "\n%s:\n", si_get_shader_name(shader));
+
+      if (shader->prolog)
+         si_shader_dump_disassembly(sscreen, &shader->prolog->binary, shader_type, wave_size, debug,
+                                    "prolog", file);
+      if (shader->previous_stage)
+         si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, shader_type,
+                                    wave_size, debug, "previous stage", file);
+      if (shader->prolog2)
+         si_shader_dump_disassembly(sscreen, &shader->prolog2->binary, shader_type, wave_size,
+                                    debug, "prolog2", file);
+
+      si_shader_dump_disassembly(sscreen, &shader->binary, shader_type, wave_size, debug, "main",
+                                 file);
+
+      if (shader->epilog)
+         si_shader_dump_disassembly(sscreen, &shader->epilog->binary, shader_type, wave_size, debug,
+                                    "epilog", file);
+      fprintf(file, "\n");
+   }
+
+   si_shader_dump_stats(sscreen, shader, file, check_debug_option);
 }
 
 static void si_dump_shader_key_vs(const struct si_shader_key *key,
-                                 const struct si_vs_prolog_bits *prolog,
-                                 const char *prefix, FILE *f)
+                                  const struct si_vs_prolog_bits *prolog, const char *prefix,
+                                  FILE *f)
 {
-       fprintf(f, "  %s.instance_divisor_is_one = %u\n",
-               prefix, prolog->instance_divisor_is_one);
-       fprintf(f, "  %s.instance_divisor_is_fetched = %u\n",
-               prefix, prolog->instance_divisor_is_fetched);
-       fprintf(f, "  %s.unpack_instance_id_from_vertex_id = %u\n",
-               prefix, prolog->unpack_instance_id_from_vertex_id);
-       fprintf(f, "  %s.ls_vgpr_fix = %u\n",
-               prefix, prolog->ls_vgpr_fix);
-
-       fprintf(f, "  mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode);
-       fprintf(f, "  mono.vs.fix_fetch = {");
-       for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
-               union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i];
-               if (i)
-                       fprintf(f, ", ");
-               if (!fix.bits)
-                       fprintf(f, "0");
-               else
-                       fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size,
-                               fix.u.num_channels_m1, fix.u.format);
-       }
-       fprintf(f, "}\n");
+   fprintf(f, "  %s.instance_divisor_is_one = %u\n", prefix, prolog->instance_divisor_is_one);
+   fprintf(f, "  %s.instance_divisor_is_fetched = %u\n", prefix,
+           prolog->instance_divisor_is_fetched);
+   fprintf(f, "  %s.unpack_instance_id_from_vertex_id = %u\n", prefix,
+           prolog->unpack_instance_id_from_vertex_id);
+   fprintf(f, "  %s.ls_vgpr_fix = %u\n", prefix, prolog->ls_vgpr_fix);
+
+   fprintf(f, "  mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode);
+   fprintf(f, "  mono.vs.fix_fetch = {");
+   for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
+      union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i];
+      if (i)
+         fprintf(f, ", ");
+      if (!fix.bits)
+         fprintf(f, "0");
+      else
+         fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1,
+                 fix.u.format);
+   }
+   fprintf(f, "}\n");
 }
 
 static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
 {
-       const struct si_shader_key *key = &shader->key;
-       enum pipe_shader_type shader_type = shader->selector->type;
-
-       fprintf(f, "SHADER KEY\n");
-
-       switch (shader_type) {
-       case PIPE_SHADER_VERTEX:
-               si_dump_shader_key_vs(key, &key->part.vs.prolog,
-                                     "part.vs.prolog", f);
-               fprintf(f, "  as_es = %u\n", key->as_es);
-               fprintf(f, "  as_ls = %u\n", key->as_ls);
-               fprintf(f, "  as_ngg = %u\n", key->as_ngg);
-               fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
-                       key->mono.u.vs_export_prim_id);
-               fprintf(f, "  opt.vs_as_prim_discard_cs = %u\n",
-                       key->opt.vs_as_prim_discard_cs);
-               fprintf(f, "  opt.cs_prim_type = %s\n",
-                       tgsi_primitive_names[key->opt.cs_prim_type]);
-               fprintf(f, "  opt.cs_indexed = %u\n",
-                       key->opt.cs_indexed);
-               fprintf(f, "  opt.cs_instancing = %u\n",
-                       key->opt.cs_instancing);
-               fprintf(f, "  opt.cs_primitive_restart = %u\n",
-                       key->opt.cs_primitive_restart);
-               fprintf(f, "  opt.cs_provoking_vertex_first = %u\n",
-                       key->opt.cs_provoking_vertex_first);
-               fprintf(f, "  opt.cs_need_correct_orientation = %u\n",
-                       key->opt.cs_need_correct_orientation);
-               fprintf(f, "  opt.cs_cull_front = %u\n",
-                       key->opt.cs_cull_front);
-               fprintf(f, "  opt.cs_cull_back = %u\n",
-                       key->opt.cs_cull_back);
-               fprintf(f, "  opt.cs_cull_z = %u\n",
-                       key->opt.cs_cull_z);
-               fprintf(f, "  opt.cs_halfz_clip_space = %u\n",
-                       key->opt.cs_halfz_clip_space);
-               break;
-
-       case PIPE_SHADER_TESS_CTRL:
-               if (shader->selector->screen->info.chip_class >= GFX9) {
-                       si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
-                                             "part.tcs.ls_prolog", f);
-               }
-               fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
-               fprintf(f, "  mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
-               break;
-
-       case PIPE_SHADER_TESS_EVAL:
-               fprintf(f, "  as_es = %u\n", key->as_es);
-               fprintf(f, "  as_ngg = %u\n", key->as_ngg);
-               fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
-                       key->mono.u.vs_export_prim_id);
-               break;
-
-       case PIPE_SHADER_GEOMETRY:
-               if (shader->is_gs_copy_shader)
-                       break;
-
-               if (shader->selector->screen->info.chip_class >= GFX9 &&
-                   key->part.gs.es->type == PIPE_SHADER_VERTEX) {
-                       si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
-                                             "part.gs.vs_prolog", f);
-               }
-               fprintf(f, "  part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
-               fprintf(f, "  part.gs.prolog.gfx9_prev_is_vs = %u\n", key->part.gs.prolog.gfx9_prev_is_vs);
-               fprintf(f, "  as_ngg = %u\n", key->as_ngg);
-               break;
-
-       case PIPE_SHADER_COMPUTE:
-               break;
-
-       case PIPE_SHADER_FRAGMENT:
-               fprintf(f, "  part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
-               fprintf(f, "  part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
-               fprintf(f, "  part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
-               fprintf(f, "  part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
-               fprintf(f, "  part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
-               fprintf(f, "  part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
-               fprintf(f, "  part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
-               fprintf(f, "  part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
-               fprintf(f, "  part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
-               fprintf(f, "  part.ps.prolog.samplemask_log_ps_iter = %u\n", key->part.ps.prolog.samplemask_log_ps_iter);
-               fprintf(f, "  part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
-               fprintf(f, "  part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
-               fprintf(f, "  part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
-               fprintf(f, "  part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
-               fprintf(f, "  part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
-               fprintf(f, "  part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
-               fprintf(f, "  part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
-               fprintf(f, "  part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
-               fprintf(f, "  mono.u.ps.interpolate_at_sample_force_center = %u\n", key->mono.u.ps.interpolate_at_sample_force_center);
-               fprintf(f, "  mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa);
-               fprintf(f, "  mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D);
-               fprintf(f, "  mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered);
-               break;
-
-       default:
-               assert(0);
-       }
-
-       if ((shader_type == PIPE_SHADER_GEOMETRY ||
-            shader_type == PIPE_SHADER_TESS_EVAL ||
-            shader_type == PIPE_SHADER_VERTEX) &&
-           !key->as_es && !key->as_ls) {
-               fprintf(f, "  opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
-               fprintf(f, "  opt.clip_disable = %u\n", key->opt.clip_disable);
-               if (shader_type != PIPE_SHADER_GEOMETRY)
-                       fprintf(f, "  opt.ngg_culling = 0x%x\n", key->opt.ngg_culling);
-       }
+   const struct si_shader_key *key = &shader->key;
+   enum pipe_shader_type shader_type = shader->selector->type;
+
+   fprintf(f, "SHADER KEY\n");
+
+   switch (shader_type) {
+   case PIPE_SHADER_VERTEX:
+      si_dump_shader_key_vs(key, &key->part.vs.prolog, "part.vs.prolog", f);
+      fprintf(f, "  as_es = %u\n", key->as_es);
+      fprintf(f, "  as_ls = %u\n", key->as_ls);
+      fprintf(f, "  as_ngg = %u\n", key->as_ngg);
+      fprintf(f, "  mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id);
+      fprintf(f, "  opt.vs_as_prim_discard_cs = %u\n", key->opt.vs_as_prim_discard_cs);
+      fprintf(f, "  opt.cs_prim_type = %s\n", tgsi_primitive_names[key->opt.cs_prim_type]);
+      fprintf(f, "  opt.cs_indexed = %u\n", key->opt.cs_indexed);
+      fprintf(f, "  opt.cs_instancing = %u\n", key->opt.cs_instancing);
+      fprintf(f, "  opt.cs_primitive_restart = %u\n", key->opt.cs_primitive_restart);
+      fprintf(f, "  opt.cs_provoking_vertex_first = %u\n", key->opt.cs_provoking_vertex_first);
+      fprintf(f, "  opt.cs_need_correct_orientation = %u\n", key->opt.cs_need_correct_orientation);
+      fprintf(f, "  opt.cs_cull_front = %u\n", key->opt.cs_cull_front);
+      fprintf(f, "  opt.cs_cull_back = %u\n", key->opt.cs_cull_back);
+      fprintf(f, "  opt.cs_cull_z = %u\n", key->opt.cs_cull_z);
+      fprintf(f, "  opt.cs_halfz_clip_space = %u\n", key->opt.cs_halfz_clip_space);
+      break;
+
+   case PIPE_SHADER_TESS_CTRL:
+      if (shader->selector->screen->info.chip_class >= GFX9) {
+         si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog, "part.tcs.ls_prolog", f);
+      }
+      fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
+      fprintf(f, "  mono.u.ff_tcs_inputs_to_copy = 0x%" PRIx64 "\n",
+              key->mono.u.ff_tcs_inputs_to_copy);
+      break;
+
+   case PIPE_SHADER_TESS_EVAL:
+      fprintf(f, "  as_es = %u\n", key->as_es);
+      fprintf(f, "  as_ngg = %u\n", key->as_ngg);
+      fprintf(f, "  mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id);
+      break;
+
+   case PIPE_SHADER_GEOMETRY:
+      if (shader->is_gs_copy_shader)
+         break;
+
+      if (shader->selector->screen->info.chip_class >= GFX9 &&
+          key->part.gs.es->type == PIPE_SHADER_VERTEX) {
+         si_dump_shader_key_vs(key, &key->part.gs.vs_prolog, "part.gs.vs_prolog", f);
+      }
+      fprintf(f, "  part.gs.prolog.tri_strip_adj_fix = %u\n",
+              key->part.gs.prolog.tri_strip_adj_fix);
+      fprintf(f, "  part.gs.prolog.gfx9_prev_is_vs = %u\n", key->part.gs.prolog.gfx9_prev_is_vs);
+      fprintf(f, "  as_ngg = %u\n", key->as_ngg);
+      break;
+
+   case PIPE_SHADER_COMPUTE:
+      break;
+
+   case PIPE_SHADER_FRAGMENT:
+      fprintf(f, "  part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
+      fprintf(f, "  part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
+      fprintf(f, "  part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
+      fprintf(f, "  part.ps.prolog.force_persp_sample_interp = %u\n",
+              key->part.ps.prolog.force_persp_sample_interp);
+      fprintf(f, "  part.ps.prolog.force_linear_sample_interp = %u\n",
+              key->part.ps.prolog.force_linear_sample_interp);
+      fprintf(f, "  part.ps.prolog.force_persp_center_interp = %u\n",
+              key->part.ps.prolog.force_persp_center_interp);
+      fprintf(f, "  part.ps.prolog.force_linear_center_interp = %u\n",
+              key->part.ps.prolog.force_linear_center_interp);
+      fprintf(f, "  part.ps.prolog.bc_optimize_for_persp = %u\n",
+              key->part.ps.prolog.bc_optimize_for_persp);
+      fprintf(f, "  part.ps.prolog.bc_optimize_for_linear = %u\n",
+              key->part.ps.prolog.bc_optimize_for_linear);
+      fprintf(f, "  part.ps.prolog.samplemask_log_ps_iter = %u\n",
+              key->part.ps.prolog.samplemask_log_ps_iter);
+      fprintf(f, "  part.ps.epilog.spi_shader_col_format = 0x%x\n",
+              key->part.ps.epilog.spi_shader_col_format);
+      fprintf(f, "  part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
+      fprintf(f, "  part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
+      fprintf(f, "  part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
+      fprintf(f, "  part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
+      fprintf(f, "  part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
+      fprintf(f, "  part.ps.epilog.poly_line_smoothing = %u\n",
+              key->part.ps.epilog.poly_line_smoothing);
+      fprintf(f, "  part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
+      fprintf(f, "  mono.u.ps.interpolate_at_sample_force_center = %u\n",
+              key->mono.u.ps.interpolate_at_sample_force_center);
+      fprintf(f, "  mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa);
+      fprintf(f, "  mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D);
+      fprintf(f, "  mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered);
+      break;
+
+   default:
+      assert(0);
+   }
+
+   if ((shader_type == PIPE_SHADER_GEOMETRY || shader_type == PIPE_SHADER_TESS_EVAL ||
+        shader_type == PIPE_SHADER_VERTEX) &&
+       !key->as_es && !key->as_ls) {
+      fprintf(f, "  opt.kill_outputs = 0x%" PRIx64 "\n", key->opt.kill_outputs);
+      fprintf(f, "  opt.clip_disable = %u\n", key->opt.clip_disable);
+      if (shader_type != PIPE_SHADER_GEOMETRY)
+         fprintf(f, "  opt.ngg_culling = 0x%x\n", key->opt.ngg_culling);
+   }
 }
 
 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
 {
-       struct si_shader *shader = ctx->shader;
-       struct si_shader_info *info = &shader->selector->info;
-
-       if ((ctx->type != PIPE_SHADER_VERTEX &&
-            ctx->type != PIPE_SHADER_TESS_EVAL) ||
-           shader->key.as_ls ||
-           shader->key.as_es)
-               return;
-
-       ac_optimize_vs_outputs(&ctx->ac,
-                              ctx->main_fn,
-                              shader->info.vs_output_param_offset,
-                              info->num_outputs,
-                              &shader->info.nr_param_exports);
+   struct si_shader *shader = ctx->shader;
+   struct si_shader_info *info = &shader->selector->info;
+
+   if ((ctx->type != PIPE_SHADER_VERTEX && ctx->type != PIPE_SHADER_TESS_EVAL) ||
+       shader->key.as_ls || shader->key.as_es)
+      return;
+
+   ac_optimize_vs_outputs(&ctx->ac, ctx->main_fn, shader->info.vs_output_param_offset,
+                          info->num_outputs, &shader->info.nr_param_exports);
 }
 
 static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
-                              const struct si_vs_prolog_bits *prolog_key,
-                              const struct si_shader_key *key,
-                              bool ngg_cull_shader)
+                               const struct si_vs_prolog_bits *prolog_key,
+                               const struct si_shader_key *key, bool ngg_cull_shader)
 {
-       /* VGPR initialization fixup for Vega10 and Raven is always done in the
-        * VS prolog. */
-       return sel->vs_needs_prolog ||
-              prolog_key->ls_vgpr_fix ||
-              prolog_key->unpack_instance_id_from_vertex_id ||
-              (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
+   /* VGPR initialization fixup for Vega10 and Raven is always done in the
+    * VS prolog. */
+   return sel->vs_needs_prolog || prolog_key->ls_vgpr_fix ||
+          prolog_key->unpack_instance_id_from_vertex_id ||
+          (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
 }
 
-static bool si_build_main_function(struct si_shader_context *ctx,
-                                  struct si_shader *shader,
-                                  struct nir_shader *nir, bool free_nir,
-                                  bool ngg_cull_shader)
+static bool si_build_main_function(struct si_shader_context *ctx, struct si_shader *shader,
+                                   struct nir_shader *nir, bool free_nir, bool ngg_cull_shader)
 {
-       struct si_shader_selector *sel = shader->selector;
-       const struct si_shader_info *info = &sel->info;
-
-       ctx->shader = shader;
-       ctx->type = sel->type;
-
-       ctx->num_const_buffers = util_last_bit(info->const_buffers_declared);
-       ctx->num_shader_buffers = util_last_bit(info->shader_buffers_declared);
-
-       ctx->num_samplers = util_last_bit(info->samplers_declared);
-       ctx->num_images = util_last_bit(info->images_declared);
-
-       si_llvm_init_resource_callbacks(ctx);
-
-       switch (ctx->type) {
-       case PIPE_SHADER_VERTEX:
-               si_llvm_init_vs_callbacks(ctx, ngg_cull_shader);
-               break;
-       case PIPE_SHADER_TESS_CTRL:
-               si_llvm_init_tcs_callbacks(ctx);
-               break;
-       case PIPE_SHADER_TESS_EVAL:
-               si_llvm_init_tes_callbacks(ctx, ngg_cull_shader);
-               break;
-       case PIPE_SHADER_GEOMETRY:
-               si_llvm_init_gs_callbacks(ctx);
-               break;
-       case PIPE_SHADER_FRAGMENT:
-               si_llvm_init_ps_callbacks(ctx);
-               break;
-       case PIPE_SHADER_COMPUTE:
-               ctx->abi.load_local_group_size = si_llvm_get_block_size;
-               break;
-       default:
-               assert(!"Unsupported shader type");
-               return false;
-       }
-
-       si_create_function(ctx, ngg_cull_shader);
-
-       if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)
-               si_preload_esgs_ring(ctx);
-
-       if (ctx->type == PIPE_SHADER_GEOMETRY)
-               si_preload_gs_rings(ctx);
-       else if (ctx->type == PIPE_SHADER_TESS_EVAL)
-               si_llvm_preload_tes_rings(ctx);
-
-       if (ctx->type == PIPE_SHADER_TESS_CTRL &&
-           sel->info.tessfactors_are_def_in_all_invocs) {
-               for (unsigned i = 0; i < 6; i++) {
-                       ctx->invoc0_tess_factors[i] =
-                               ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
-               }
-       }
-
-       if (ctx->type == PIPE_SHADER_GEOMETRY) {
-               for (unsigned i = 0; i < 4; i++) {
-                       ctx->gs_next_vertex[i] =
-                               ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
-               }
-               if (shader->key.as_ngg) {
-                       for (unsigned i = 0; i < 4; ++i) {
-                               ctx->gs_curprim_verts[i] =
-                                       ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
-                               ctx->gs_generated_prims[i] =
-                                       ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
-                       }
-
-                       unsigned scratch_size = 8;
-                       if (sel->so.num_outputs)
-                               scratch_size = 44;
-
-                       assert(!ctx->gs_ngg_scratch);
-                       LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, scratch_size);
-                       ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
-                               ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
-                       LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32));
-                       LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
-
-                       ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(ctx->ac.module,
-                               LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
-                       LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
-                       LLVMSetAlignment(ctx->gs_ngg_emit, 4);
-               }
-       }
-
-       if (ctx->type != PIPE_SHADER_GEOMETRY &&
-           (shader->key.as_ngg && !shader->key.as_es)) {
-               /* Unconditionally declare scratch space base for streamout and
-                * vertex compaction. Whether space is actually allocated is
-                * determined during linking / PM4 creation.
-                *
-                * Add an extra dword per vertex to ensure an odd stride, which
-                * avoids bank conflicts for SoA accesses.
-                */
-               if (!gfx10_is_ngg_passthrough(shader))
-                       si_llvm_declare_esgs_ring(ctx);
-
-               /* This is really only needed when streamout and / or vertex
-                * compaction is enabled.
-                */
-               if (!ctx->gs_ngg_scratch &&
-                   (sel->so.num_outputs || shader->key.opt.ngg_culling)) {
-                       LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, 8);
-                       ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
-                               asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
-                       LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32));
-                       LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
-               }
-       }
-
-       /* For GFX9 merged shaders:
-        * - Set EXEC for the first shader. If the prolog is present, set
-        *   EXEC there instead.
-        * - Add a barrier before the second shader.
-        * - In the second shader, reset EXEC to ~0 and wrap the main part in
-        *   an if-statement. This is required for correctness in geometry
-        *   shaders, to ensure that empty GS waves do not send GS_EMIT and
-        *   GS_CUT messages.
-        *
-        * For monolithic merged shaders, the first shader is wrapped in an
-        * if-block together with its prolog in si_build_wrapper_function.
-        *
-        * NGG vertex and tess eval shaders running as the last
-        * vertex/geometry stage handle execution explicitly using
-        * if-statements.
-        */
-       if (ctx->screen->info.chip_class >= GFX9) {
-               if (!shader->is_monolithic &&
-                   (shader->key.as_es || shader->key.as_ls) &&
-                   (ctx->type == PIPE_SHADER_TESS_EVAL ||
-                    (ctx->type == PIPE_SHADER_VERTEX &&
-                     !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
-                                         &shader->key, ngg_cull_shader)))) {
-                       si_init_exec_from_input(ctx,
-                                               ctx->merged_wave_info, 0);
-               } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
-                          ctx->type == PIPE_SHADER_GEOMETRY ||
-                          (shader->key.as_ngg && !shader->key.as_es)) {
-                       LLVMValueRef thread_enabled;
-                       bool nested_barrier;
-
-                       if (!shader->is_monolithic ||
-                           (ctx->type == PIPE_SHADER_TESS_EVAL &&
-                            shader->key.as_ngg && !shader->key.as_es &&
-                            !shader->key.opt.ngg_culling))
-                               ac_init_exec_full_mask(&ctx->ac);
-
-                       if ((ctx->type == PIPE_SHADER_VERTEX ||
-                            ctx->type == PIPE_SHADER_TESS_EVAL) &&
-                           shader->key.as_ngg && !shader->key.as_es &&
-                           !shader->key.opt.ngg_culling) {
-                               gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);
-
-                               /* Build the primitive export at the beginning
-                                * of the shader if possible.
-                                */
-                               if (gfx10_ngg_export_prim_early(shader))
-                                       gfx10_ngg_build_export_prim(ctx, NULL, NULL);
-                       }
-
-                       if (ctx->type == PIPE_SHADER_TESS_CTRL ||
-                           ctx->type == PIPE_SHADER_GEOMETRY) {
-                               if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) {
-                                       gfx10_ngg_gs_emit_prologue(ctx);
-                                       nested_barrier = false;
-                               } else {
-                                       nested_barrier = true;
-                               }
-
-                               thread_enabled = si_is_gs_thread(ctx);
-                       } else {
-                               thread_enabled = si_is_es_thread(ctx);
-                               nested_barrier = false;
-                       }
-
-                       ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
-                       ctx->merged_wrap_if_label = 11500;
-                       ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);
-
-                       if (nested_barrier) {
-                               /* Execute a barrier before the second shader in
-                                * a merged shader.
-                                *
-                                * Execute the barrier inside the conditional block,
-                                * so that empty waves can jump directly to s_endpgm,
-                                * which will also signal the barrier.
-                                *
-                                * This is possible in gfx9, because an empty wave
-                                * for the second shader does not participate in
-                                * the epilogue. With NGG, empty waves may still
-                                * be required to export data (e.g. GS output vertices),
-                                * so we cannot let them exit early.
-                                *
-                                * If the shader is TCS and the TCS epilog is present
-                                * and contains a barrier, it will wait there and then
-                                * reach s_endpgm.
-                                */
-                               si_llvm_emit_barrier(ctx);
-                       }
-               }
-       }
-
-       bool success = si_nir_build_llvm(ctx, nir);
-       if (free_nir)
-               ralloc_free(nir);
-       if (!success) {
-               fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
-               return false;
-       }
-
-       si_llvm_build_ret(ctx, ctx->return_value);
-       return true;
+   struct si_shader_selector *sel = shader->selector;
+   const struct si_shader_info *info = &sel->info;
+
+   ctx->shader = shader;
+   ctx->type = sel->type;
+
+   ctx->num_const_buffers = util_last_bit(info->const_buffers_declared);
+   ctx->num_shader_buffers = util_last_bit(info->shader_buffers_declared);
+
+   ctx->num_samplers = util_last_bit(info->samplers_declared);
+   ctx->num_images = util_last_bit(info->images_declared);
+
+   si_llvm_init_resource_callbacks(ctx);
+
+   switch (ctx->type) {
+   case PIPE_SHADER_VERTEX:
+      si_llvm_init_vs_callbacks(ctx, ngg_cull_shader);
+      break;
+   case PIPE_SHADER_TESS_CTRL:
+      si_llvm_init_tcs_callbacks(ctx);
+      break;
+   case PIPE_SHADER_TESS_EVAL:
+      si_llvm_init_tes_callbacks(ctx, ngg_cull_shader);
+      break;
+   case PIPE_SHADER_GEOMETRY:
+      si_llvm_init_gs_callbacks(ctx);
+      break;
+   case PIPE_SHADER_FRAGMENT:
+      si_llvm_init_ps_callbacks(ctx);
+      break;
+   case PIPE_SHADER_COMPUTE:
+      ctx->abi.load_local_group_size = si_llvm_get_block_size;
+      break;
+   default:
+      assert(!"Unsupported shader type");
+      return false;
+   }
+
+   si_create_function(ctx, ngg_cull_shader);
+
+   if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)
+      si_preload_esgs_ring(ctx);
+
+   if (ctx->type == PIPE_SHADER_GEOMETRY)
+      si_preload_gs_rings(ctx);
+   else if (ctx->type == PIPE_SHADER_TESS_EVAL)
+      si_llvm_preload_tes_rings(ctx);
+
+   if (ctx->type == PIPE_SHADER_TESS_CTRL && sel->info.tessfactors_are_def_in_all_invocs) {
+      for (unsigned i = 0; i < 6; i++) {
+         ctx->invoc0_tess_factors[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
+      }
+   }
+
+   if (ctx->type == PIPE_SHADER_GEOMETRY) {
+      for (unsigned i = 0; i < 4; i++) {
+         ctx->gs_next_vertex[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
+      }
+      if (shader->key.as_ngg) {
+         for (unsigned i = 0; i < 4; ++i) {
+            ctx->gs_curprim_verts[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
+            ctx->gs_generated_prims[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
+         }
+
+         unsigned scratch_size = 8;
+         if (sel->so.num_outputs)
+            scratch_size = 44;
+
+         assert(!ctx->gs_ngg_scratch);
+         LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, scratch_size);
+         ctx->gs_ngg_scratch =
+            LLVMAddGlobalInAddressSpace(ctx->ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
+         LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32));
+         LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
+
+         ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(
+            ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
+         LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
+         LLVMSetAlignment(ctx->gs_ngg_emit, 4);
+      }
+   }
+
+   if (ctx->type != PIPE_SHADER_GEOMETRY && (shader->key.as_ngg && !shader->key.as_es)) {
+      /* Unconditionally declare scratch space base for streamout and
+       * vertex compaction. Whether space is actually allocated is
+       * determined during linking / PM4 creation.
+       *
+       * Add an extra dword per vertex to ensure an odd stride, which
+       * avoids bank conflicts for SoA accesses.
+       */
+      if (!gfx10_is_ngg_passthrough(shader))
+         si_llvm_declare_esgs_ring(ctx);
+
+      /* This is really only needed when streamout and / or vertex
+       * compaction is enabled.
+       */
+      if (!ctx->gs_ngg_scratch && (sel->so.num_outputs || shader->key.opt.ngg_culling)) {
+         LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, 8);
+         ctx->gs_ngg_scratch =
+            LLVMAddGlobalInAddressSpace(ctx->ac.module, asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
+         LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32));
+         LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
+      }
+   }
+
+   /* For GFX9 merged shaders:
+    * - Set EXEC for the first shader. If the prolog is present, set
+    *   EXEC there instead.
+    * - Add a barrier before the second shader.
+    * - In the second shader, reset EXEC to ~0 and wrap the main part in
+    *   an if-statement. This is required for correctness in geometry
+    *   shaders, to ensure that empty GS waves do not send GS_EMIT and
+    *   GS_CUT messages.
+    *
+    * For monolithic merged shaders, the first shader is wrapped in an
+    * if-block together with its prolog in si_build_wrapper_function.
+    *
+    * NGG vertex and tess eval shaders running as the last
+    * vertex/geometry stage handle execution explicitly using
+    * if-statements.
+    */
+   if (ctx->screen->info.chip_class >= GFX9) {
+      if (!shader->is_monolithic && (shader->key.as_es || shader->key.as_ls) &&
+          (ctx->type == PIPE_SHADER_TESS_EVAL ||
+           (ctx->type == PIPE_SHADER_VERTEX &&
+            !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, ngg_cull_shader)))) {
+         si_init_exec_from_input(ctx, ctx->merged_wave_info, 0);
+      } else if (ctx->type == PIPE_SHADER_TESS_CTRL || ctx->type == PIPE_SHADER_GEOMETRY ||
+                 (shader->key.as_ngg && !shader->key.as_es)) {
+         LLVMValueRef thread_enabled;
+         bool nested_barrier;
+
+         if (!shader->is_monolithic || (ctx->type == PIPE_SHADER_TESS_EVAL && shader->key.as_ngg &&
+                                        !shader->key.as_es && !shader->key.opt.ngg_culling))
+            ac_init_exec_full_mask(&ctx->ac);
+
+         if ((ctx->type == PIPE_SHADER_VERTEX || ctx->type == PIPE_SHADER_TESS_EVAL) &&
+             shader->key.as_ngg && !shader->key.as_es && !shader->key.opt.ngg_culling) {
+            gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);
+
+            /* Build the primitive export at the beginning
+             * of the shader if possible.
+             */
+            if (gfx10_ngg_export_prim_early(shader))
+               gfx10_ngg_build_export_prim(ctx, NULL, NULL);
+         }
+
+         if (ctx->type == PIPE_SHADER_TESS_CTRL || ctx->type == PIPE_SHADER_GEOMETRY) {
+            if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) {
+               gfx10_ngg_gs_emit_prologue(ctx);
+               nested_barrier = false;
+            } else {
+               nested_barrier = true;
+            }
+
+            thread_enabled = si_is_gs_thread(ctx);
+         } else {
+            thread_enabled = si_is_es_thread(ctx);
+            nested_barrier = false;
+         }
+
+         ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
+         ctx->merged_wrap_if_label = 11500;
+         ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);
+
+         if (nested_barrier) {
+            /* Execute a barrier before the second shader in
+             * a merged shader.
+             *
+             * Execute the barrier inside the conditional block,
+             * so that empty waves can jump directly to s_endpgm,
+             * which will also signal the barrier.
+             *
+             * This is possible in gfx9, because an empty wave
+             * for the second shader does not participate in
+             * the epilogue. With NGG, empty waves may still
+             * be required to export data (e.g. GS output vertices),
+             * so we cannot let them exit early.
+             *
+             * If the shader is TCS and the TCS epilog is present
+             * and contains a barrier, it will wait there and then
+             * reach s_endpgm.
+             */
+            si_llvm_emit_barrier(ctx);
+         }
+      }
+   }
+
+   bool success = si_nir_build_llvm(ctx, nir);
+   if (free_nir)
+      ralloc_free(nir);
+   if (!success) {
+      fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
+      return false;
+   }
+
+   si_llvm_build_ret(ctx, ctx->return_value);
+   return true;
 }
 
 /**
@@ -1622,425 +1510,385 @@ static bool si_build_main_function(struct si_shader_context *ctx,
  * \param shader_out       The vertex shader, or the next shader if merging LS+HS or ES+GS.
  * \param key              Output shader part key.
  */
-static void si_get_vs_prolog_key(const struct si_shader_info *info,
-                                unsigned num_input_sgprs,
-                                bool ngg_cull_shader,
-                                const struct si_vs_prolog_bits *prolog_key,
-                                struct si_shader *shader_out,
-                                union si_shader_part_key *key)
+static void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_sgprs,
+                                 bool ngg_cull_shader, const struct si_vs_prolog_bits *prolog_key,
+                                 struct si_shader *shader_out, union si_shader_part_key *key)
 {
-       memset(key, 0, sizeof(*key));
-       key->vs_prolog.states = *prolog_key;
-       key->vs_prolog.num_input_sgprs = num_input_sgprs;
-       key->vs_prolog.num_inputs = info->num_inputs;
-       key->vs_prolog.as_ls = shader_out->key.as_ls;
-       key->vs_prolog.as_es = shader_out->key.as_es;
-       key->vs_prolog.as_ngg = shader_out->key.as_ngg;
-       key->vs_prolog.as_prim_discard_cs = shader_out->key.opt.vs_as_prim_discard_cs;
-
-       if (ngg_cull_shader) {
-               key->vs_prolog.gs_fast_launch_tri_list = !!(shader_out->key.opt.ngg_culling &
-                                                           SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST);
-               key->vs_prolog.gs_fast_launch_tri_strip = !!(shader_out->key.opt.ngg_culling &
-                                                            SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);
-       } else {
-               key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling;
-       }
-
-       if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
-               key->vs_prolog.as_ls = 1;
-               key->vs_prolog.num_merged_next_stage_vgprs = 2;
-       } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
-               key->vs_prolog.as_es = 1;
-               key->vs_prolog.num_merged_next_stage_vgprs = 5;
-       } else if (shader_out->key.as_ngg) {
-               key->vs_prolog.num_merged_next_stage_vgprs = 5;
-       }
-
-       /* Only one of these combinations can be set. as_ngg can be set with as_es. */
-       assert(key->vs_prolog.as_ls +
-              key->vs_prolog.as_ngg +
-              (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) +
-              key->vs_prolog.as_prim_discard_cs <= 1);
-
-       /* Enable loading the InstanceID VGPR. */
-       uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
-
-       if ((key->vs_prolog.states.instance_divisor_is_one |
-            key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
-               shader_out->info.uses_instanceid = true;
+   memset(key, 0, sizeof(*key));
+   key->vs_prolog.states = *prolog_key;
+   key->vs_prolog.num_input_sgprs = num_input_sgprs;
+   key->vs_prolog.num_inputs = info->num_inputs;
+   key->vs_prolog.as_ls = shader_out->key.as_ls;
+   key->vs_prolog.as_es = shader_out->key.as_es;
+   key->vs_prolog.as_ngg = shader_out->key.as_ngg;
+   key->vs_prolog.as_prim_discard_cs = shader_out->key.opt.vs_as_prim_discard_cs;
+
+   if (ngg_cull_shader) {
+      key->vs_prolog.gs_fast_launch_tri_list =
+         !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST);
+      key->vs_prolog.gs_fast_launch_tri_strip =
+         !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);
+   } else {
+      key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling;
+   }
+
+   if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
+      key->vs_prolog.as_ls = 1;
+      key->vs_prolog.num_merged_next_stage_vgprs = 2;
+   } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
+      key->vs_prolog.as_es = 1;
+      key->vs_prolog.num_merged_next_stage_vgprs = 5;
+   } else if (shader_out->key.as_ngg) {
+      key->vs_prolog.num_merged_next_stage_vgprs = 5;
+   }
+
+   /* Only one of these combinations can be set. as_ngg can be set with as_es. */
+   assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg +
+             (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) + key->vs_prolog.as_prim_discard_cs <=
+          1);
+
+   /* Enable loading the InstanceID VGPR. */
+   uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
+
+   if ((key->vs_prolog.states.instance_divisor_is_one |
+        key->vs_prolog.states.instance_divisor_is_fetched) &
+       input_mask)
+      shader_out->info.uses_instanceid = true;
 }
 
 static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
-                                   struct si_shader_selector *sel)
+                                    struct si_shader_selector *sel)
 {
-       if (!compiler->low_opt_passes)
-               return false;
+   if (!compiler->low_opt_passes)
+      return false;
 
-       /* Assume a slow CPU. */
-       assert(!sel->screen->info.has_dedicated_vram &&
-              sel->screen->info.chip_class <= GFX8);
+   /* Assume a slow CPU. */
+   assert(!sel->screen->info.has_dedicated_vram && sel->screen->info.chip_class <= GFX8);
 
-       /* For a crazy dEQP test containing 2597 memory opcodes, mostly
-        * buffer stores. */
-       return sel->type == PIPE_SHADER_COMPUTE &&
-              sel->info.num_memory_instructions > 1000;
+   /* For a crazy dEQP test containing 2597 memory opcodes, mostly
+    * buffer stores. */
+   return sel->type == PIPE_SHADER_COMPUTE && sel->info.num_memory_instructions > 1000;
 }
 
-static struct nir_shader *get_nir_shader(struct si_shader_selector *sel,
-                                        bool *free_nir)
+static struct nir_shader *get_nir_shader(struct si_shader_selector *sel, bool *free_nir)
 {
-       *free_nir = false;
-
-       if (sel->nir) {
-               return sel->nir;
-       } else if (sel->nir_binary) {
-               struct pipe_screen *screen = &sel->screen->b;
-               const void *options =
-                       screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR,
-                                                    sel->type);
-
-               struct blob_reader blob_reader;
-               blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
-               *free_nir = true;
-               return nir_deserialize(NULL, options, &blob_reader);
-       }
-       return NULL;
+   *free_nir = false;
+
+   if (sel->nir) {
+      return sel->nir;
+   } else if (sel->nir_binary) {
+      struct pipe_screen *screen = &sel->screen->b;
+      const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR, sel->type);
+
+      struct blob_reader blob_reader;
+      blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
+      *free_nir = true;
+      return nir_deserialize(NULL, options, &blob_reader);
+   }
+   return NULL;
 }
 
-static bool si_llvm_compile_shader(struct si_screen *sscreen,
-                                  struct ac_llvm_compiler *compiler,
-                                  struct si_shader *shader,
-                                  struct pipe_debug_callback *debug,
-                                  struct nir_shader *nir,
-                                  bool free_nir)
+static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
+                                   struct si_shader *shader, struct pipe_debug_callback *debug,
+                                   struct nir_shader *nir, bool free_nir)
 {
-       struct si_shader_selector *sel = shader->selector;
-       struct si_shader_context ctx;
-
-       si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader));
-
-       LLVMValueRef ngg_cull_main_fn = NULL;
-       if (shader->key.opt.ngg_culling) {
-               if (!si_build_main_function(&ctx, shader, nir, false, true)) {
-                       si_llvm_dispose(&ctx);
-                       return false;
-               }
-               ngg_cull_main_fn = ctx.main_fn;
-               ctx.main_fn = NULL;
-       }
-
-       if (!si_build_main_function(&ctx, shader, nir, free_nir, false)) {
-               si_llvm_dispose(&ctx);
-               return false;
-       }
-
-       if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
-               LLVMValueRef parts[4];
-               unsigned num_parts = 0;
-               bool has_prolog = false;
-               LLVMValueRef main_fn = ctx.main_fn;
-
-               if (ngg_cull_main_fn) {
-                       if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
-                                              &shader->key, true)) {
-                               union si_shader_part_key prolog_key;
-                               si_get_vs_prolog_key(&sel->info,
-                                                    shader->info.num_input_sgprs,
-                                                    true,
-                                                    &shader->key.part.vs.prolog,
-                                                    shader, &prolog_key);
-                               prolog_key.vs_prolog.is_monolithic = true;
-                               si_llvm_build_vs_prolog(&ctx, &prolog_key);
-                               parts[num_parts++] = ctx.main_fn;
-                               has_prolog = true;
-                       }
-                       parts[num_parts++] = ngg_cull_main_fn;
-               }
-
-               if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
-                                      &shader->key, false)) {
-                       union si_shader_part_key prolog_key;
-                       si_get_vs_prolog_key(&sel->info,
-                                            shader->info.num_input_sgprs,
-                                            false,
-                                            &shader->key.part.vs.prolog,
-                                            shader, &prolog_key);
-                       prolog_key.vs_prolog.is_monolithic = true;
-                       si_llvm_build_vs_prolog(&ctx, &prolog_key);
-                       parts[num_parts++] = ctx.main_fn;
-                       has_prolog = true;
-               }
-               parts[num_parts++] = main_fn;
-
-               si_build_wrapper_function(&ctx, parts, num_parts,
-                                         has_prolog ? 1 : 0, 0);
-
-               if (ctx.shader->key.opt.vs_as_prim_discard_cs)
-                       si_build_prim_discard_compute_shader(&ctx);
-       } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL &&
-                  ngg_cull_main_fn) {
-               LLVMValueRef parts[2];
-
-               parts[0] = ngg_cull_main_fn;
-               parts[1] = ctx.main_fn;
-
-               si_build_wrapper_function(&ctx, parts, 2, 0, 0);
-       } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
-               if (sscreen->info.chip_class >= GFX9) {
-                       struct si_shader_selector *ls = shader->key.part.tcs.ls;
-                       LLVMValueRef parts[4];
-                       bool vs_needs_prolog =
-                               si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog,
-                                                  &shader->key, false);
-
-                       /* TCS main part */
-                       parts[2] = ctx.main_fn;
-
-                       /* TCS epilog */
-                       union si_shader_part_key tcs_epilog_key;
-                       memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
-                       tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
-                       si_llvm_build_tcs_epilog(&ctx, &tcs_epilog_key);
-                       parts[3] = ctx.main_fn;
-
-                       /* VS as LS main part */
-                       nir = get_nir_shader(ls, &free_nir);
-                       struct si_shader shader_ls = {};
-                       shader_ls.selector = ls;
-                       shader_ls.key.as_ls = 1;
-                       shader_ls.key.mono = shader->key.mono;
-                       shader_ls.key.opt = shader->key.opt;
-                       shader_ls.is_monolithic = true;
-
-                       if (!si_build_main_function(&ctx, &shader_ls, nir, free_nir, false)) {
-                               si_llvm_dispose(&ctx);
-                               return false;
-                       }
-                       shader->info.uses_instanceid |= ls->info.uses_instanceid;
-                       parts[1] = ctx.main_fn;
-
-                       /* LS prolog */
-                       if (vs_needs_prolog) {
-                               union si_shader_part_key vs_prolog_key;
-                               si_get_vs_prolog_key(&ls->info,
-                                                    shader_ls.info.num_input_sgprs,
-                                                    false,
-                                                    &shader->key.part.tcs.ls_prolog,
-                                                    shader, &vs_prolog_key);
-                               vs_prolog_key.vs_prolog.is_monolithic = true;
-                               si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
-                               parts[0] = ctx.main_fn;
-                       }
-
-                       /* Reset the shader context. */
-                       ctx.shader = shader;
-                       ctx.type = PIPE_SHADER_TESS_CTRL;
-
-                       si_build_wrapper_function(&ctx,
-                                                 parts + !vs_needs_prolog,
-                                                 4 - !vs_needs_prolog, vs_needs_prolog,
-                                                 vs_needs_prolog ? 2 : 1);
-               } else {
-                       LLVMValueRef parts[2];
-                       union si_shader_part_key epilog_key;
-
-                       parts[0] = ctx.main_fn;
-
-                       memset(&epilog_key, 0, sizeof(epilog_key));
-                       epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
-                       si_llvm_build_tcs_epilog(&ctx, &epilog_key);
-                       parts[1] = ctx.main_fn;
-
-                       si_build_wrapper_function(&ctx, parts, 2, 0, 0);
-               }
-       } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
-               if (ctx.screen->info.chip_class >= GFX9) {
-                       struct si_shader_selector *es = shader->key.part.gs.es;
-                       LLVMValueRef es_prolog = NULL;
-                       LLVMValueRef es_main = NULL;
-                       LLVMValueRef gs_prolog = NULL;
-                       LLVMValueRef gs_main = ctx.main_fn;
-
-                       /* GS prolog */
-                       union si_shader_part_key gs_prolog_key;
-                       memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
-                       gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
-                       gs_prolog_key.gs_prolog.is_monolithic = true;
-                       gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
-                       si_llvm_build_gs_prolog(&ctx, &gs_prolog_key);
-                       gs_prolog = ctx.main_fn;
-
-                       /* ES main part */
-                       nir = get_nir_shader(es, &free_nir);
-                       struct si_shader shader_es = {};
-                       shader_es.selector = es;
-                       shader_es.key.as_es = 1;
-                       shader_es.key.as_ngg = shader->key.as_ngg;
-                       shader_es.key.mono = shader->key.mono;
-                       shader_es.key.opt = shader->key.opt;
-                       shader_es.is_monolithic = true;
-
-                       if (!si_build_main_function(&ctx, &shader_es, nir, free_nir, false)) {
-                               si_llvm_dispose(&ctx);
-                               return false;
-                       }
-                       shader->info.uses_instanceid |= es->info.uses_instanceid;
-                       es_main = ctx.main_fn;
-
-                       /* ES prolog */
-                       if (es->type == PIPE_SHADER_VERTEX &&
-                           si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog,
-                                              &shader->key, false)) {
-                               union si_shader_part_key vs_prolog_key;
-                               si_get_vs_prolog_key(&es->info,
-                                                    shader_es.info.num_input_sgprs,
-                                                    false,
-                                                    &shader->key.part.gs.vs_prolog,
-                                                    shader, &vs_prolog_key);
-                               vs_prolog_key.vs_prolog.is_monolithic = true;
-                               si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
-                               es_prolog = ctx.main_fn;
-                       }
-
-                       /* Reset the shader context. */
-                       ctx.shader = shader;
-                       ctx.type = PIPE_SHADER_GEOMETRY;
-
-                       /* Prepare the array of shader parts. */
-                       LLVMValueRef parts[4];
-                       unsigned num_parts = 0, main_part, next_first_part;
-
-                       if (es_prolog)
-                               parts[num_parts++] = es_prolog;
-
-                       parts[main_part = num_parts++] = es_main;
-                       parts[next_first_part = num_parts++] = gs_prolog;
-                       parts[num_parts++] = gs_main;
-
-                       si_build_wrapper_function(&ctx, parts, num_parts,
-                                                 main_part, next_first_part);
-               } else {
-                       LLVMValueRef parts[2];
-                       union si_shader_part_key prolog_key;
-
-                       parts[1] = ctx.main_fn;
-
-                       memset(&prolog_key, 0, sizeof(prolog_key));
-                       prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
-                       si_llvm_build_gs_prolog(&ctx, &prolog_key);
-                       parts[0] = ctx.main_fn;
-
-                       si_build_wrapper_function(&ctx, parts, 2, 1, 0);
-               }
-       } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
-               si_llvm_build_monolithic_ps(&ctx, shader);
-       }
-
-       si_llvm_optimize_module(&ctx);
-
-       /* Post-optimization transformations and analysis. */
-       si_optimize_vs_outputs(&ctx);
-
-       if ((debug && debug->debug_message) ||
-           si_can_dump_shader(sscreen, ctx.type)) {
-               ctx.shader->info.private_mem_vgprs =
-                       ac_count_scratch_private_memory(ctx.main_fn);
-       }
-
-       /* Make sure the input is a pointer and not integer followed by inttoptr. */
-       assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) ==
-              LLVMPointerTypeKind);
-
-       /* Compile to bytecode. */
-       if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler,
-                            &ctx.ac, debug, ctx.type, si_get_shader_name(shader),
-                            si_should_optimize_less(compiler, shader->selector))) {
-               si_llvm_dispose(&ctx);
-               fprintf(stderr, "LLVM failed to compile shader\n");
-               return false;
-       }
-
-       si_llvm_dispose(&ctx);
-       return true;
+   struct si_shader_selector *sel = shader->selector;
+   struct si_shader_context ctx;
+
+   si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader));
+
+   LLVMValueRef ngg_cull_main_fn = NULL;
+   if (shader->key.opt.ngg_culling) {
+      if (!si_build_main_function(&ctx, shader, nir, false, true)) {
+         si_llvm_dispose(&ctx);
+         return false;
+      }
+      ngg_cull_main_fn = ctx.main_fn;
+      ctx.main_fn = NULL;
+   }
+
+   if (!si_build_main_function(&ctx, shader, nir, free_nir, false)) {
+      si_llvm_dispose(&ctx);
+      return false;
+   }
+
+   if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
+      LLVMValueRef parts[4];
+      unsigned num_parts = 0;
+      bool has_prolog = false;
+      LLVMValueRef main_fn = ctx.main_fn;
+
+      if (ngg_cull_main_fn) {
+         if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, true)) {
+            union si_shader_part_key prolog_key;
+            si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, true,
+                                 &shader->key.part.vs.prolog, shader, &prolog_key);
+            prolog_key.vs_prolog.is_monolithic = true;
+            si_llvm_build_vs_prolog(&ctx, &prolog_key);
+            parts[num_parts++] = ctx.main_fn;
+            has_prolog = true;
+         }
+         parts[num_parts++] = ngg_cull_main_fn;
+      }
+
+      if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, false)) {
+         union si_shader_part_key prolog_key;
+         si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, false,
+                              &shader->key.part.vs.prolog, shader, &prolog_key);
+         prolog_key.vs_prolog.is_monolithic = true;
+         si_llvm_build_vs_prolog(&ctx, &prolog_key);
+         parts[num_parts++] = ctx.main_fn;
+         has_prolog = true;
+      }
+      parts[num_parts++] = main_fn;
+
+      si_build_wrapper_function(&ctx, parts, num_parts, has_prolog ? 1 : 0, 0);
+
+      if (ctx.shader->key.opt.vs_as_prim_discard_cs)
+         si_build_prim_discard_compute_shader(&ctx);
+   } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL && ngg_cull_main_fn) {
+      LLVMValueRef parts[2];
+
+      parts[0] = ngg_cull_main_fn;
+      parts[1] = ctx.main_fn;
+
+      si_build_wrapper_function(&ctx, parts, 2, 0, 0);
+   } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
+      if (sscreen->info.chip_class >= GFX9) {
+         struct si_shader_selector *ls = shader->key.part.tcs.ls;
+         LLVMValueRef parts[4];
+         bool vs_needs_prolog =
+            si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog, &shader->key, false);
+
+         /* TCS main part */
+         parts[2] = ctx.main_fn;
+
+         /* TCS epilog */
+         union si_shader_part_key tcs_epilog_key;
+         memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
+         tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
+         si_llvm_build_tcs_epilog(&ctx, &tcs_epilog_key);
+         parts[3] = ctx.main_fn;
+
+         /* VS as LS main part */
+         nir = get_nir_shader(ls, &free_nir);
+         struct si_shader shader_ls = {};
+         shader_ls.selector = ls;
+         shader_ls.key.as_ls = 1;
+         shader_ls.key.mono = shader->key.mono;
+         shader_ls.key.opt = shader->key.opt;
+         shader_ls.is_monolithic = true;
+
+         if (!si_build_main_function(&ctx, &shader_ls, nir, free_nir, false)) {
+            si_llvm_dispose(&ctx);
+            return false;
+         }
+         shader->info.uses_instanceid |= ls->info.uses_instanceid;
+         parts[1] = ctx.main_fn;
+
+         /* LS prolog */
+         if (vs_needs_prolog) {
+            union si_shader_part_key vs_prolog_key;
+            si_get_vs_prolog_key(&ls->info, shader_ls.info.num_input_sgprs, false,
+                                 &shader->key.part.tcs.ls_prolog, shader, &vs_prolog_key);
+            vs_prolog_key.vs_prolog.is_monolithic = true;
+            si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
+            parts[0] = ctx.main_fn;
+         }
+
+         /* Reset the shader context. */
+         ctx.shader = shader;
+         ctx.type = PIPE_SHADER_TESS_CTRL;
+
+         si_build_wrapper_function(&ctx, parts + !vs_needs_prolog, 4 - !vs_needs_prolog,
+                                   vs_needs_prolog, vs_needs_prolog ? 2 : 1);
+      } else {
+         LLVMValueRef parts[2];
+         union si_shader_part_key epilog_key;
+
+         parts[0] = ctx.main_fn;
+
+         memset(&epilog_key, 0, sizeof(epilog_key));
+         epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
+         si_llvm_build_tcs_epilog(&ctx, &epilog_key);
+         parts[1] = ctx.main_fn;
+
+         si_build_wrapper_function(&ctx, parts, 2, 0, 0);
+      }
+   } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
+      if (ctx.screen->info.chip_class >= GFX9) {
+         struct si_shader_selector *es = shader->key.part.gs.es;
+         LLVMValueRef es_prolog = NULL;
+         LLVMValueRef es_main = NULL;
+         LLVMValueRef gs_prolog = NULL;
+         LLVMValueRef gs_main = ctx.main_fn;
+
+         /* GS prolog */
+         union si_shader_part_key gs_prolog_key;
+         memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
+         gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
+         gs_prolog_key.gs_prolog.is_monolithic = true;
+         gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
+         si_llvm_build_gs_prolog(&ctx, &gs_prolog_key);
+         gs_prolog = ctx.main_fn;
+
+         /* ES main part */
+         nir = get_nir_shader(es, &free_nir);
+         struct si_shader shader_es = {};
+         shader_es.selector = es;
+         shader_es.key.as_es = 1;
+         shader_es.key.as_ngg = shader->key.as_ngg;
+         shader_es.key.mono = shader->key.mono;
+         shader_es.key.opt = shader->key.opt;
+         shader_es.is_monolithic = true;
+
+         if (!si_build_main_function(&ctx, &shader_es, nir, free_nir, false)) {
+            si_llvm_dispose(&ctx);
+            return false;
+         }
+         shader->info.uses_instanceid |= es->info.uses_instanceid;
+         es_main = ctx.main_fn;
+
+         /* ES prolog */
+         if (es->type == PIPE_SHADER_VERTEX &&
+             si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog, &shader->key, false)) {
+            union si_shader_part_key vs_prolog_key;
+            si_get_vs_prolog_key(&es->info, shader_es.info.num_input_sgprs, false,
+                                 &shader->key.part.gs.vs_prolog, shader, &vs_prolog_key);
+            vs_prolog_key.vs_prolog.is_monolithic = true;
+            si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
+            es_prolog = ctx.main_fn;
+         }
+
+         /* Reset the shader context. */
+         ctx.shader = shader;
+         ctx.type = PIPE_SHADER_GEOMETRY;
+
+         /* Prepare the array of shader parts. */
+         LLVMValueRef parts[4];
+         unsigned num_parts = 0, main_part, next_first_part;
+
+         if (es_prolog)
+            parts[num_parts++] = es_prolog;
+
+         parts[main_part = num_parts++] = es_main;
+         parts[next_first_part = num_parts++] = gs_prolog;
+         parts[num_parts++] = gs_main;
+
+         si_build_wrapper_function(&ctx, parts, num_parts, main_part, next_first_part);
+      } else {
+         LLVMValueRef parts[2];
+         union si_shader_part_key prolog_key;
+
+         parts[1] = ctx.main_fn;
+
+         memset(&prolog_key, 0, sizeof(prolog_key));
+         prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
+         si_llvm_build_gs_prolog(&ctx, &prolog_key);
+         parts[0] = ctx.main_fn;
+
+         si_build_wrapper_function(&ctx, parts, 2, 1, 0);
+      }
+   } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
+      si_llvm_build_monolithic_ps(&ctx, shader);
+   }
+
+   si_llvm_optimize_module(&ctx);
+
+   /* Post-optimization transformations and analysis. */
+   si_optimize_vs_outputs(&ctx);
+
+   if ((debug && debug->debug_message) || si_can_dump_shader(sscreen, ctx.type)) {
+      ctx.shader->info.private_mem_vgprs = ac_count_scratch_private_memory(ctx.main_fn);
+   }
+
+   /* Make sure the input is a pointer and not integer followed by inttoptr. */
+   assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind);
+
+   /* Compile to bytecode. */
+   if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, &ctx.ac, debug,
+                        ctx.type, si_get_shader_name(shader),
+                        si_should_optimize_less(compiler, shader->selector))) {
+      si_llvm_dispose(&ctx);
+      fprintf(stderr, "LLVM failed to compile shader\n");
+      return false;
+   }
+
+   si_llvm_dispose(&ctx);
+   return true;
 }
 
-bool si_compile_shader(struct si_screen *sscreen,
-                      struct ac_llvm_compiler *compiler,
-                      struct si_shader *shader,
-                      struct pipe_debug_callback *debug)
+bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
+                       struct si_shader *shader, struct pipe_debug_callback *debug)
 {
-       struct si_shader_selector *sel = shader->selector;
-       bool free_nir;
-       struct nir_shader *nir = get_nir_shader(sel, &free_nir);
-
-       /* Dump NIR before doing NIR->LLVM conversion in case the
-        * conversion fails. */
-       if (si_can_dump_shader(sscreen, sel->type) &&
-           !(sscreen->debug_flags & DBG(NO_NIR))) {
-               nir_print_shader(nir, stderr);
-               si_dump_streamout(&sel->so);
-       }
-
-       memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
-              sizeof(shader->info.vs_output_param_offset));
-
-       shader->info.uses_instanceid = sel->info.uses_instanceid;
-
-       /* TODO: ACO could compile non-monolithic shaders here (starting
-        * with PS and NGG VS), but monolithic shaders should be compiled
-        * by LLVM due to more complicated compilation.
-        */
-       if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir))
-               return false;
-
-       /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
-        * LLVM 3.9svn has this bug.
-        */
-       if (sel->type == PIPE_SHADER_COMPUTE) {
-               unsigned wave_size = sscreen->compute_wave_size;
-               unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd *
-                                    (wave_size == 32 ? 2 : 1);
-               unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
-               unsigned max_sgprs_per_wave = 128;
-               unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
-               unsigned threads_per_tg = si_get_max_workgroup_size(shader);
-               unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size);
-               unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);
-
-               max_vgprs = max_vgprs / waves_per_simd;
-               max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);
-
-               if (shader->config.num_sgprs > max_sgprs ||
-                   shader->config.num_vgprs > max_vgprs) {
-                       fprintf(stderr, "LLVM failed to compile a shader correctly: "
-                               "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
-                               shader->config.num_sgprs, shader->config.num_vgprs,
-                               max_sgprs, max_vgprs);
-
-                       /* Just terminate the process, because dependent
-                        * shaders can hang due to bad input data, but use
-                        * the env var to allow shader-db to work.
-                        */
-                       if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
-                               abort();
-               }
-       }
-
-       /* Add the scratch offset to input SGPRs. */
-       if (shader->config.scratch_bytes_per_wave && !si_is_merged_shader(shader))
-               shader->info.num_input_sgprs += 1; /* scratch byte offset */
-
-       /* Calculate the number of fragment input VGPRs. */
-       if (sel->type == PIPE_SHADER_FRAGMENT) {
-               shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(&shader->config,
-                                               &shader->info.face_vgpr_index,
-                                               &shader->info.ancillary_vgpr_index);
-       }
-
-       si_calculate_max_simd_waves(shader);
-       si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
-       return true;
+   struct si_shader_selector *sel = shader->selector;
+   bool free_nir;
+   struct nir_shader *nir = get_nir_shader(sel, &free_nir);
+
+   /* Dump NIR before doing NIR->LLVM conversion in case the
+    * conversion fails. */
+   if (si_can_dump_shader(sscreen, sel->type) && !(sscreen->debug_flags & DBG(NO_NIR))) {
+      nir_print_shader(nir, stderr);
+      si_dump_streamout(&sel->so);
+   }
+
+   memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
+          sizeof(shader->info.vs_output_param_offset));
+
+   shader->info.uses_instanceid = sel->info.uses_instanceid;
+
+   /* TODO: ACO could compile non-monolithic shaders here (starting
+    * with PS and NGG VS), but monolithic shaders should be compiled
+    * by LLVM due to more complicated compilation.
+    */
+   if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir))
+      return false;
+
+   /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
+    * LLVM 3.9svn has this bug.
+    */
+   if (sel->type == PIPE_SHADER_COMPUTE) {
+      unsigned wave_size = sscreen->compute_wave_size;
+      unsigned max_vgprs =
+         sscreen->info.num_physical_wave64_vgprs_per_simd * (wave_size == 32 ? 2 : 1);
+      unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
+      unsigned max_sgprs_per_wave = 128;
+      unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
+      unsigned threads_per_tg = si_get_max_workgroup_size(shader);
+      unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size);
+      unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);
+
+      max_vgprs = max_vgprs / waves_per_simd;
+      max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);
+
+      if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) {
+         fprintf(stderr,
+                 "LLVM failed to compile a shader correctly: "
+                 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
+                 shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs);
+
+         /* Just terminate the process, because dependent
+          * shaders can hang due to bad input data, but use
+          * the env var to allow shader-db to work.
+          */
+         if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
+            abort();
+      }
+   }
+
+   /* Add the scratch offset to input SGPRs. */
+   if (shader->config.scratch_bytes_per_wave && !si_is_merged_shader(shader))
+      shader->info.num_input_sgprs += 1; /* scratch byte offset */
+
+   /* Calculate the number of fragment input VGPRs. */
+   if (sel->type == PIPE_SHADER_FRAGMENT) {
+      shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(
+         &shader->config, &shader->info.face_vgpr_index, &shader->info.ancillary_vgpr_index);
+   }
+
+   si_calculate_max_simd_waves(shader);
+   si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
+   return true;
 }
 
 /**
@@ -2057,335 +1905,300 @@ bool si_compile_shader(struct si_screen *sscreen,
  * \return             non-NULL on success
  */
 static struct si_shader_part *
-si_get_shader_part(struct si_screen *sscreen,
-                  struct si_shader_part **list,
-                  enum pipe_shader_type type,
-                  bool prolog,
-                  union si_shader_part_key *key,
-                  struct ac_llvm_compiler *compiler,
-                  struct pipe_debug_callback *debug,
-                  void (*build)(struct si_shader_context *,
-                                union si_shader_part_key *),
-                  const char *name)
+si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
+                   enum pipe_shader_type type, bool prolog, union si_shader_part_key *key,
+                   struct ac_llvm_compiler *compiler, struct pipe_debug_callback *debug,
+                   void (*build)(struct si_shader_context *, union si_shader_part_key *),
+                   const char *name)
 {
-       struct si_shader_part *result;
-
-       simple_mtx_lock(&sscreen->shader_parts_mutex);
-
-       /* Find existing. */
-       for (result = *list; result; result = result->next) {
-               if (memcmp(&result->key, key, sizeof(*key)) == 0) {
-                       simple_mtx_unlock(&sscreen->shader_parts_mutex);
-                       return result;
-               }
-       }
-
-       /* Compile a new one. */
-       result = CALLOC_STRUCT(si_shader_part);
-       result->key = *key;
-
-       struct si_shader_selector sel = {};
-       sel.screen = sscreen;
-
-       struct si_shader shader = {};
-       shader.selector = &sel;
-
-       switch (type) {
-       case PIPE_SHADER_VERTEX:
-               shader.key.as_ls = key->vs_prolog.as_ls;
-               shader.key.as_es = key->vs_prolog.as_es;
-               shader.key.as_ngg = key->vs_prolog.as_ngg;
-               shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs;
-               break;
-       case PIPE_SHADER_TESS_CTRL:
-               assert(!prolog);
-               shader.key.part.tcs.epilog = key->tcs_epilog.states;
-               break;
-       case PIPE_SHADER_GEOMETRY:
-               assert(prolog);
-               shader.key.as_ngg = key->gs_prolog.as_ngg;
-               break;
-       case PIPE_SHADER_FRAGMENT:
-               if (prolog)
-                       shader.key.part.ps.prolog = key->ps_prolog.states;
-               else
-                       shader.key.part.ps.epilog = key->ps_epilog.states;
-               break;
-       default:
-               unreachable("bad shader part");
-       }
-
-       struct si_shader_context ctx;
-       si_llvm_context_init(&ctx, sscreen, compiler,
-                            si_get_wave_size(sscreen, type, shader.key.as_ngg,
-                                             shader.key.as_es,
-                                             shader.key.opt.vs_as_prim_discard_cs));
-       ctx.shader = &shader;
-       ctx.type = type;
-
-       build(&ctx, key);
-
-       /* Compile. */
-       si_llvm_optimize_module(&ctx);
-
-       if (!si_compile_llvm(sscreen, &result->binary, &result->config, compiler,
-                            &ctx.ac, debug, ctx.type, name, false)) {
-               FREE(result);
-               result = NULL;
-               goto out;
-       }
-
-       result->next = *list;
-       *list = result;
+   struct si_shader_part *result;
+
+   simple_mtx_lock(&sscreen->shader_parts_mutex);
+
+   /* Find existing. */
+   for (result = *list; result; result = result->next) {
+      if (memcmp(&result->key, key, sizeof(*key)) == 0) {
+         simple_mtx_unlock(&sscreen->shader_parts_mutex);
+         return result;
+      }
+   }
+
+   /* Compile a new one. */
+   result = CALLOC_STRUCT(si_shader_part);
+   result->key = *key;
+
+   struct si_shader_selector sel = {};
+   sel.screen = sscreen;
+
+   struct si_shader shader = {};
+   shader.selector = &sel;
+
+   switch (type) {
+   case PIPE_SHADER_VERTEX:
+      shader.key.as_ls = key->vs_prolog.as_ls;
+      shader.key.as_es = key->vs_prolog.as_es;
+      shader.key.as_ngg = key->vs_prolog.as_ngg;
+      shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs;
+      break;
+   case PIPE_SHADER_TESS_CTRL:
+      assert(!prolog);
+      shader.key.part.tcs.epilog = key->tcs_epilog.states;
+      break;
+   case PIPE_SHADER_GEOMETRY:
+      assert(prolog);
+      shader.key.as_ngg = key->gs_prolog.as_ngg;
+      break;
+   case PIPE_SHADER_FRAGMENT:
+      if (prolog)
+         shader.key.part.ps.prolog = key->ps_prolog.states;
+      else
+         shader.key.part.ps.epilog = key->ps_epilog.states;
+      break;
+   default:
+      unreachable("bad shader part");
+   }
+
+   struct si_shader_context ctx;
+   si_llvm_context_init(&ctx, sscreen, compiler,
+                        si_get_wave_size(sscreen, type, shader.key.as_ngg, shader.key.as_es,
+                                         shader.key.opt.vs_as_prim_discard_cs));
+   ctx.shader = &shader;
+   ctx.type = type;
+
+   build(&ctx, key);
+
+   /* Compile. */
+   si_llvm_optimize_module(&ctx);
+
+   if (!si_compile_llvm(sscreen, &result->binary, &result->config, compiler, &ctx.ac, debug,
+                        ctx.type, name, false)) {
+      FREE(result);
+      result = NULL;
+      goto out;
+   }
+
+   result->next = *list;
+   *list = result;
 
 out:
-       si_llvm_dispose(&ctx);
-       simple_mtx_unlock(&sscreen->shader_parts_mutex);
-       return result;
+   si_llvm_dispose(&ctx);
+   simple_mtx_unlock(&sscreen->shader_parts_mutex);
+   return result;
 }
 
-static bool si_get_vs_prolog(struct si_screen *sscreen,
-                            struct ac_llvm_compiler *compiler,
-                            struct si_shader *shader,
-                            struct pipe_debug_callback *debug,
-                            struct si_shader *main_part,
-                            const struct si_vs_prolog_bits *key)
+static bool si_get_vs_prolog(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
+                             struct si_shader *shader, struct pipe_debug_callback *debug,
+                             struct si_shader *main_part, const struct si_vs_prolog_bits *key)
 {
-       struct si_shader_selector *vs = main_part->selector;
-
-       if (!si_vs_needs_prolog(vs, key, &shader->key, false))
-               return true;
-
-       /* Get the prolog. */
-       union si_shader_part_key prolog_key;
-       si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false,
-                            key, shader, &prolog_key);
-
-       shader->prolog =
-               si_get_shader_part(sscreen, &sscreen->vs_prologs,
-                                  PIPE_SHADER_VERTEX, true, &prolog_key, compiler,
-                                  debug, si_llvm_build_vs_prolog,
-                                  "Vertex Shader Prolog");
-       return shader->prolog != NULL;
+   struct si_shader_selector *vs = main_part->selector;
+
+   if (!si_vs_needs_prolog(vs, key, &shader->key, false))
+      return true;
+
+   /* Get the prolog. */
+   union si_shader_part_key prolog_key;
+   si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false, key, shader,
+                        &prolog_key);
+
+   shader->prolog =
+      si_get_shader_part(sscreen, &sscreen->vs_prologs, PIPE_SHADER_VERTEX, true, &prolog_key,
+                         compiler, debug, si_llvm_build_vs_prolog, "Vertex Shader Prolog");
+   return shader->prolog != NULL;
 }
 
 /**
  * Select and compile (or reuse) vertex shader parts (prolog & epilog).
  */
-static bool si_shader_select_vs_parts(struct si_screen *sscreen,
-                                     struct ac_llvm_compiler *compiler,
-                                     struct si_shader *shader,
-                                     struct pipe_debug_callback *debug)
+static bool si_shader_select_vs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
+                                      struct si_shader *shader, struct pipe_debug_callback *debug)
 {
-       return si_get_vs_prolog(sscreen, compiler, shader, debug, shader,
-                               &shader->key.part.vs.prolog);
+   return si_get_vs_prolog(sscreen, compiler, shader, debug, shader, &shader->key.part.vs.prolog);
 }
 
 /**
  * Select and compile (or reuse) TCS parts (epilog).
  */
-static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
-                                      struct ac_llvm_compiler *compiler,
-                                      struct si_shader *shader,
-                                      struct pipe_debug_callback *debug)
+static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
+                                       struct si_shader *shader, struct pipe_debug_callback *debug)
 {
-       if (sscreen->info.chip_class >= GFX9) {
-               struct si_shader *ls_main_part =
-                       shader->key.part.tcs.ls->main_shader_part_ls;
-
-               if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,
-                                     &shader->key.part.tcs.ls_prolog))
-                       return false;
-
-               shader->previous_stage = ls_main_part;
-       }
-
-       /* Get the epilog. */
-       union si_shader_part_key epilog_key;
-       memset(&epilog_key, 0, sizeof(epilog_key));
-       epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
-
-       shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
-                                           PIPE_SHADER_TESS_CTRL, false,
-                                           &epilog_key, compiler, debug,
-                                           si_llvm_build_tcs_epilog,
-                                           "Tessellation Control Shader Epilog");
-       return shader->epilog != NULL;
+   if (sscreen->info.chip_class >= GFX9) {
+      struct si_shader *ls_main_part = shader->key.part.tcs.ls->main_shader_part_ls;
+
+      if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,
+                            &shader->key.part.tcs.ls_prolog))
+         return false;
+
+      shader->previous_stage = ls_main_part;
+   }
+
+   /* Get the epilog. */
+   union si_shader_part_key epilog_key;
+   memset(&epilog_key, 0, sizeof(epilog_key));
+   epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
+
+   shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, PIPE_SHADER_TESS_CTRL, false,
+                                       &epilog_key, compiler, debug, si_llvm_build_tcs_epilog,
+                                       "Tessellation Control Shader Epilog");
+   return shader->epilog != NULL;
 }
 
 /**
  * Select and compile (or reuse) GS parts (prolog).
  */
-static bool si_shader_select_gs_parts(struct si_screen *sscreen,
-                                     struct ac_llvm_compiler *compiler,
-                                     struct si_shader *shader,
-                                     struct pipe_debug_callback *debug)
+static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
+                                      struct si_shader *shader, struct pipe_debug_callback *debug)
 {
-       if (sscreen->info.chip_class >= GFX9) {
-               struct si_shader *es_main_part;
-               enum pipe_shader_type es_type = shader->key.part.gs.es->type;
-
-               if (shader->key.as_ngg)
-                       es_main_part = shader->key.part.gs.es->main_shader_part_ngg_es;
-               else
-                       es_main_part = shader->key.part.gs.es->main_shader_part_es;
-
-               if (es_type == PIPE_SHADER_VERTEX &&
-                   !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,
-                                     &shader->key.part.gs.vs_prolog))
-                       return false;
-
-               shader->previous_stage = es_main_part;
-       }
-
-       if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
-               return true;
-
-       union si_shader_part_key prolog_key;
-       memset(&prolog_key, 0, sizeof(prolog_key));
-       prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
-       prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
-
-       shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
-                                           PIPE_SHADER_GEOMETRY, true,
-                                           &prolog_key, compiler, debug,
-                                           si_llvm_build_gs_prolog,
-                                           "Geometry Shader Prolog");
-       return shader->prolog2 != NULL;
+   if (sscreen->info.chip_class >= GFX9) {
+      struct si_shader *es_main_part;
+      enum pipe_shader_type es_type = shader->key.part.gs.es->type;
+
+      if (shader->key.as_ngg)
+         es_main_part = shader->key.part.gs.es->main_shader_part_ngg_es;
+      else
+         es_main_part = shader->key.part.gs.es->main_shader_part_es;
+
+      if (es_type == PIPE_SHADER_VERTEX &&
+          !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,
+                            &shader->key.part.gs.vs_prolog))
+         return false;
+
+      shader->previous_stage = es_main_part;
+   }
+
+   if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
+      return true;
+
+   union si_shader_part_key prolog_key;
+   memset(&prolog_key, 0, sizeof(prolog_key));
+   prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
+   prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
+
+   shader->prolog2 =
+      si_get_shader_part(sscreen, &sscreen->gs_prologs, PIPE_SHADER_GEOMETRY, true, &prolog_key,
+                         compiler, debug, si_llvm_build_gs_prolog, "Geometry Shader Prolog");
+   return shader->prolog2 != NULL;
 }
 
 /**
  * Compute the PS prolog key, which contains all the information needed to
  * build the PS prolog function, and set related bits in shader->config.
  */
-void si_get_ps_prolog_key(struct si_shader *shader,
-                         union si_shader_part_key *key,
-                         bool separate_prolog)
+void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key,
+                          bool separate_prolog)
 {
-       struct si_shader_info *info = &shader->selector->info;
-
-       memset(key, 0, sizeof(*key));
-       key->ps_prolog.states = shader->key.part.ps.prolog;
-       key->ps_prolog.colors_read = info->colors_read;
-       key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
-       key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
-       key->ps_prolog.wqm = info->uses_derivatives &&
-               (key->ps_prolog.colors_read ||
-                key->ps_prolog.states.force_persp_sample_interp ||
-                key->ps_prolog.states.force_linear_sample_interp ||
-                key->ps_prolog.states.force_persp_center_interp ||
-                key->ps_prolog.states.force_linear_center_interp ||
-                key->ps_prolog.states.bc_optimize_for_persp ||
-                key->ps_prolog.states.bc_optimize_for_linear);
-       key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;
-
-       if (info->colors_read) {
-               unsigned *color = shader->selector->color_attr_index;
-
-               if (shader->key.part.ps.prolog.color_two_side) {
-                       /* BCOLORs are stored after the last input. */
-                       key->ps_prolog.num_interp_inputs = info->num_inputs;
-                       key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
-                       if (separate_prolog)
-                               shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
-               }
-
-               for (unsigned i = 0; i < 2; i++) {
-                       unsigned interp = info->input_interpolate[color[i]];
-                       unsigned location = info->input_interpolate_loc[color[i]];
-
-                       if (!(info->colors_read & (0xf << i*4)))
-                               continue;
-
-                       key->ps_prolog.color_attr_index[i] = color[i];
-
-                       if (shader->key.part.ps.prolog.flatshade_colors &&
-                           interp == TGSI_INTERPOLATE_COLOR)
-                               interp = TGSI_INTERPOLATE_CONSTANT;
-
-                       switch (interp) {
-                       case TGSI_INTERPOLATE_CONSTANT:
-                               key->ps_prolog.color_interp_vgpr_index[i] = -1;
-                               break;
-                       case TGSI_INTERPOLATE_PERSPECTIVE:
-                       case TGSI_INTERPOLATE_COLOR:
-                               /* Force the interpolation location for colors here. */
-                               if (shader->key.part.ps.prolog.force_persp_sample_interp)
-                                       location = TGSI_INTERPOLATE_LOC_SAMPLE;
-                               if (shader->key.part.ps.prolog.force_persp_center_interp)
-                                       location = TGSI_INTERPOLATE_LOC_CENTER;
-
-                               switch (location) {
-                               case TGSI_INTERPOLATE_LOC_SAMPLE:
-                                       key->ps_prolog.color_interp_vgpr_index[i] = 0;
-                                       if (separate_prolog) {
-                                               shader->config.spi_ps_input_ena |=
-                                                       S_0286CC_PERSP_SAMPLE_ENA(1);
-                                       }
-                                       break;
-                               case TGSI_INTERPOLATE_LOC_CENTER:
-                                       key->ps_prolog.color_interp_vgpr_index[i] = 2;
-                                       if (separate_prolog) {
-                                               shader->config.spi_ps_input_ena |=
-                                                       S_0286CC_PERSP_CENTER_ENA(1);
-                                       }
-                                       break;
-                               case TGSI_INTERPOLATE_LOC_CENTROID:
-                                       key->ps_prolog.color_interp_vgpr_index[i] = 4;
-                                       if (separate_prolog) {
-                                               shader->config.spi_ps_input_ena |=
-                                                       S_0286CC_PERSP_CENTROID_ENA(1);
-                                       }
-                                       break;
-                               default:
-                                       assert(0);
-                               }
-                               break;
-                       case TGSI_INTERPOLATE_LINEAR:
-                               /* Force the interpolation location for colors here. */
-                               if (shader->key.part.ps.prolog.force_linear_sample_interp)
-                                       location = TGSI_INTERPOLATE_LOC_SAMPLE;
-                               if (shader->key.part.ps.prolog.force_linear_center_interp)
-                                       location = TGSI_INTERPOLATE_LOC_CENTER;
-
-                               /* The VGPR assignment for non-monolithic shaders
-                                * works because InitialPSInputAddr is set on the
-                                * main shader and PERSP_PULL_MODEL is never used.
-                                */
-                               switch (location) {
-                               case TGSI_INTERPOLATE_LOC_SAMPLE:
-                                       key->ps_prolog.color_interp_vgpr_index[i] =
-                                               separate_prolog ? 6 : 9;
-                                       if (separate_prolog) {
-                                               shader->config.spi_ps_input_ena |=
-                                                       S_0286CC_LINEAR_SAMPLE_ENA(1);
-                                       }
-                                       break;
-                               case TGSI_INTERPOLATE_LOC_CENTER:
-                                       key->ps_prolog.color_interp_vgpr_index[i] =
-                                               separate_prolog ? 8 : 11;
-                                       if (separate_prolog) {
-                                               shader->config.spi_ps_input_ena |=
-                                                       S_0286CC_LINEAR_CENTER_ENA(1);
-                                       }
-                                       break;
-                               case TGSI_INTERPOLATE_LOC_CENTROID:
-                                       key->ps_prolog.color_interp_vgpr_index[i] =
-                                               separate_prolog ? 10 : 13;
-                                       if (separate_prolog) {
-                                               shader->config.spi_ps_input_ena |=
-                                                       S_0286CC_LINEAR_CENTROID_ENA(1);
-                                       }
-                                       break;
-                               default:
-                                       assert(0);
-                               }
-                               break;
-                       default:
-                               assert(0);
-                       }
-               }
-       }
+   struct si_shader_info *info = &shader->selector->info;
+
+   memset(key, 0, sizeof(*key));
+   key->ps_prolog.states = shader->key.part.ps.prolog;
+   key->ps_prolog.colors_read = info->colors_read;
+   key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
+   key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
+   key->ps_prolog.wqm =
+      info->uses_derivatives &&
+      (key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
+       key->ps_prolog.states.force_linear_sample_interp ||
+       key->ps_prolog.states.force_persp_center_interp ||
+       key->ps_prolog.states.force_linear_center_interp ||
+       key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear);
+   key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;
+
+   if (info->colors_read) {
+      unsigned *color = shader->selector->color_attr_index;
+
+      if (shader->key.part.ps.prolog.color_two_side) {
+         /* BCOLORs are stored after the last input. */
+         key->ps_prolog.num_interp_inputs = info->num_inputs;
+         key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
+         if (separate_prolog)
+            shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
+      }
+
+      for (unsigned i = 0; i < 2; i++) {
+         unsigned interp = info->input_interpolate[color[i]];
+         unsigned location = info->input_interpolate_loc[color[i]];
+
+         if (!(info->colors_read & (0xf << i * 4)))
+            continue;
+
+         key->ps_prolog.color_attr_index[i] = color[i];
+
+         if (shader->key.part.ps.prolog.flatshade_colors && interp == TGSI_INTERPOLATE_COLOR)
+            interp = TGSI_INTERPOLATE_CONSTANT;
+
+         switch (interp) {
+         case TGSI_INTERPOLATE_CONSTANT:
+            key->ps_prolog.color_interp_vgpr_index[i] = -1;
+            break;
+         case TGSI_INTERPOLATE_PERSPECTIVE:
+         case TGSI_INTERPOLATE_COLOR:
+            /* Force the interpolation location for colors here. */
+            if (shader->key.part.ps.prolog.force_persp_sample_interp)
+               location = TGSI_INTERPOLATE_LOC_SAMPLE;
+            if (shader->key.part.ps.prolog.force_persp_center_interp)
+               location = TGSI_INTERPOLATE_LOC_CENTER;
+
+            switch (location) {
+            case TGSI_INTERPOLATE_LOC_SAMPLE:
+               key->ps_prolog.color_interp_vgpr_index[i] = 0;
+               if (separate_prolog) {
+                  shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
+               }
+               break;
+            case TGSI_INTERPOLATE_LOC_CENTER:
+               key->ps_prolog.color_interp_vgpr_index[i] = 2;
+               if (separate_prolog) {
+                  shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
+               }
+               break;
+            case TGSI_INTERPOLATE_LOC_CENTROID:
+               key->ps_prolog.color_interp_vgpr_index[i] = 4;
+               if (separate_prolog) {
+                  shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1);
+               }
+               break;
+            default:
+               assert(0);
+            }
+            break;
+         case TGSI_INTERPOLATE_LINEAR:
+            /* Force the interpolation location for colors here. */
+            if (shader->key.part.ps.prolog.force_linear_sample_interp)
+               location = TGSI_INTERPOLATE_LOC_SAMPLE;
+            if (shader->key.part.ps.prolog.force_linear_center_interp)
+               location = TGSI_INTERPOLATE_LOC_CENTER;
+
+            /* The VGPR assignment for non-monolithic shaders
+             * works because InitialPSInputAddr is set on the
+             * main shader and PERSP_PULL_MODEL is never used.
+             */
+            switch (location) {
+            case TGSI_INTERPOLATE_LOC_SAMPLE:
+               key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 6 : 9;
+               if (separate_prolog) {
+                  shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
+               }
+               break;
+            case TGSI_INTERPOLATE_LOC_CENTER:
+               key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 8 : 11;
+               if (separate_prolog) {
+                  shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
+               }
+               break;
+            case TGSI_INTERPOLATE_LOC_CENTROID:
+               key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 10 : 13;
+               if (separate_prolog) {
+                  shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1);
+               }
+               break;
+            default:
+               assert(0);
+            }
+            break;
+         default:
+            assert(0);
+         }
+      }
+   }
 }
 
 /**
@@ -2393,331 +2206,308 @@ void si_get_ps_prolog_key(struct si_shader *shader,
  */
 bool si_need_ps_prolog(const union si_shader_part_key *key)
 {
-       return key->ps_prolog.colors_read ||
-              key->ps_prolog.states.force_persp_sample_interp ||
-              key->ps_prolog.states.force_linear_sample_interp ||
-              key->ps_prolog.states.force_persp_center_interp ||
-              key->ps_prolog.states.force_linear_center_interp ||
-              key->ps_prolog.states.bc_optimize_for_persp ||
-              key->ps_prolog.states.bc_optimize_for_linear ||
-              key->ps_prolog.states.poly_stipple ||
-              key->ps_prolog.states.samplemask_log_ps_iter;
+   return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
+          key->ps_prolog.states.force_linear_sample_interp ||
+          key->ps_prolog.states.force_persp_center_interp ||
+          key->ps_prolog.states.force_linear_center_interp ||
+          key->ps_prolog.states.bc_optimize_for_persp ||
+          key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple ||
+          key->ps_prolog.states.samplemask_log_ps_iter;
 }
 
 /**
  * Compute the PS epilog key, which contains all the information needed to
  * build the PS epilog function.
  */
-void si_get_ps_epilog_key(struct si_shader *shader,
-                         union si_shader_part_key *key)
+void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key)
 {
-       struct si_shader_info *info = &shader->selector->info;
-       memset(key, 0, sizeof(*key));
-       key->ps_epilog.colors_written = info->colors_written;
-       key->ps_epilog.writes_z = info->writes_z;
-       key->ps_epilog.writes_stencil = info->writes_stencil;
-       key->ps_epilog.writes_samplemask = info->writes_samplemask;
-       key->ps_epilog.states = shader->key.part.ps.epilog;
+   struct si_shader_info *info = &shader->selector->info;
+   memset(key, 0, sizeof(*key));
+   key->ps_epilog.colors_written = info->colors_written;
+   key->ps_epilog.writes_z = info->writes_z;
+   key->ps_epilog.writes_stencil = info->writes_stencil;
+   key->ps_epilog.writes_samplemask = info->writes_samplemask;
+   key->ps_epilog.states = shader->key.part.ps.epilog;
 }
 
 /**
  * Select and compile (or reuse) pixel shader parts (prolog & epilog).
  */
-static bool si_shader_select_ps_parts(struct si_screen *sscreen,
-                                     struct ac_llvm_compiler *compiler,
-                                     struct si_shader *shader,
-                                     struct pipe_debug_callback *debug)
+static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
+                                      struct si_shader *shader, struct pipe_debug_callback *debug)
 {
-       union si_shader_part_key prolog_key;
-       union si_shader_part_key epilog_key;
-
-       /* Get the prolog. */
-       si_get_ps_prolog_key(shader, &prolog_key, true);
-
-       /* The prolog is a no-op if these aren't set. */
-       if (si_need_ps_prolog(&prolog_key)) {
-               shader->prolog =
-                       si_get_shader_part(sscreen, &sscreen->ps_prologs,
-                                          PIPE_SHADER_FRAGMENT, true,
-                                          &prolog_key, compiler, debug,
-                                          si_llvm_build_ps_prolog,
-                                          "Fragment Shader Prolog");
-               if (!shader->prolog)
-                       return false;
-       }
-
-       /* Get the epilog. */
-       si_get_ps_epilog_key(shader, &epilog_key);
-
-       shader->epilog =
-               si_get_shader_part(sscreen, &sscreen->ps_epilogs,
-                                  PIPE_SHADER_FRAGMENT, false,
-                                  &epilog_key, compiler, debug,
-                                  si_llvm_build_ps_epilog,
-                                  "Fragment Shader Epilog");
-       if (!shader->epilog)
-               return false;
-
-       /* Enable POS_FIXED_PT if polygon stippling is enabled. */
-       if (shader->key.part.ps.prolog.poly_stipple) {
-               shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
-               assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
-       }
-
-       /* Set up the enable bits for per-sample shading if needed. */
-       if (shader->key.part.ps.prolog.force_persp_sample_interp &&
-           (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
-            G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
-               shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
-               shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
-               shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
-       }
-       if (shader->key.part.ps.prolog.force_linear_sample_interp &&
-           (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
-            G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
-               shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
-               shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
-               shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
-       }
-       if (shader->key.part.ps.prolog.force_persp_center_interp &&
-           (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
-            G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
-               shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
-               shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
-               shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
-       }
-       if (shader->key.part.ps.prolog.force_linear_center_interp &&
-           (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
-            G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
-               shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
-               shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
-               shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
-       }
-
-       /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
-       if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
-           !(shader->config.spi_ps_input_ena & 0xf)) {
-               shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
-               assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
-       }
-
-       /* At least one pair of interpolation weights must be enabled. */
-       if (!(shader->config.spi_ps_input_ena & 0x7f)) {
-               shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
-               assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
-       }
-
-       /* Samplemask fixup requires the sample ID. */
-       if (shader->key.part.ps.prolog.samplemask_log_ps_iter) {
-               shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
-               assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
-       }
-
-       /* The sample mask input is always enabled, because the API shader always
-        * passes it through to the epilog. Disable it here if it's unused.
-        */
-       if (!shader->key.part.ps.epilog.poly_line_smoothing &&
-           !shader->selector->info.reads_samplemask)
-               shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
-
-       return true;
+   union si_shader_part_key prolog_key;
+   union si_shader_part_key epilog_key;
+
+   /* Get the prolog. */
+   si_get_ps_prolog_key(shader, &prolog_key, true);
+
+   /* The prolog is a no-op if these aren't set. */
+   if (si_need_ps_prolog(&prolog_key)) {
+      shader->prolog =
+         si_get_shader_part(sscreen, &sscreen->ps_prologs, PIPE_SHADER_FRAGMENT, true, &prolog_key,
+                            compiler, debug, si_llvm_build_ps_prolog, "Fragment Shader Prolog");
+      if (!shader->prolog)
+         return false;
+   }
+
+   /* Get the epilog. */
+   si_get_ps_epilog_key(shader, &epilog_key);
+
+   shader->epilog =
+      si_get_shader_part(sscreen, &sscreen->ps_epilogs, PIPE_SHADER_FRAGMENT, false, &epilog_key,
+                         compiler, debug, si_llvm_build_ps_epilog, "Fragment Shader Epilog");
+   if (!shader->epilog)
+      return false;
+
+   /* Enable POS_FIXED_PT if polygon stippling is enabled. */
+   if (shader->key.part.ps.prolog.poly_stipple) {
+      shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
+      assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
+   }
+
+   /* Set up the enable bits for per-sample shading if needed. */
+   if (shader->key.part.ps.prolog.force_persp_sample_interp &&
+       (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
+        G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
+      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
+      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
+      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
+   }
+   if (shader->key.part.ps.prolog.force_linear_sample_interp &&
+       (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
+        G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
+      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
+      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
+      shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
+   }
+   if (shader->key.part.ps.prolog.force_persp_center_interp &&
+       (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
+        G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
+      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
+      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
+      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
+   }
+   if (shader->key.part.ps.prolog.force_linear_center_interp &&
+       (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
+        G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
+      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
+      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
+      shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
+   }
+
+   /* POS_W_FLOAT requires that one of the perspective weights is enabled. */
+   if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
+       !(shader->config.spi_ps_input_ena & 0xf)) {
+      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
+      assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
+   }
+
+   /* At least one pair of interpolation weights must be enabled. */
+   if (!(shader->config.spi_ps_input_ena & 0x7f)) {
+      shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
+      assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
+   }
+
+   /* Samplemask fixup requires the sample ID. */
+   if (shader->key.part.ps.prolog.samplemask_log_ps_iter) {
+      shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
+      assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
+   }
+
+   /* The sample mask input is always enabled, because the API shader always
+    * passes it through to the epilog. Disable it here if it's unused.
+    */
+   if (!shader->key.part.ps.epilog.poly_line_smoothing && !shader->selector->info.reads_samplemask)
+      shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
+
+   return true;
 }
 
-void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
-                                     unsigned *lds_size)
+void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size)
 {
-       /* If tessellation is all offchip and on-chip GS isn't used, this
-        * workaround is not needed.
-        */
-       return;
-
-       /* SPI barrier management bug:
-        *   Make sure we have at least 4k of LDS in use to avoid the bug.
-        *   It applies to workgroup sizes of more than one wavefront.
-        */
-       if (sscreen->info.family == CHIP_BONAIRE ||
-           sscreen->info.family == CHIP_KABINI)
-               *lds_size = MAX2(*lds_size, 8);
+   /* If tessellation is all offchip and on-chip GS isn't used, this
+    * workaround is not needed.
+    */
+   return;
+
+   /* SPI barrier management bug:
+    *   Make sure we have at least 4k of LDS in use to avoid the bug.
+    *   It applies to workgroup sizes of more than one wavefront.
+    */
+   if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI)
+      *lds_size = MAX2(*lds_size, 8);
 }
 
 void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
 {
-       unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
+   unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
 
-       shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
+   shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
 
-       if (shader->selector->type == PIPE_SHADER_COMPUTE &&
-           si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) {
-               si_multiwave_lds_size_workaround(sscreen,
-                                                &shader->config.lds_size);
-       }
+   if (shader->selector->type == PIPE_SHADER_COMPUTE &&
+       si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) {
+      si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size);
+   }
 }
 
-bool si_create_shader_variant(struct si_screen *sscreen,
-                             struct ac_llvm_compiler *compiler,
-                             struct si_shader *shader,
-                             struct pipe_debug_callback *debug)
+bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
+                              struct si_shader *shader, struct pipe_debug_callback *debug)
 {
-       struct si_shader_selector *sel = shader->selector;
-       struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
-
-       /* LS, ES, VS are compiled on demand if the main part hasn't been
-        * compiled for that stage.
-        *
-        * GS are compiled on demand if the main part hasn't been compiled
-        * for the chosen NGG-ness.
-        *
-        * Vertex shaders are compiled on demand when a vertex fetch
-        * workaround must be applied.
-        */
-       if (shader->is_monolithic) {
-               /* Monolithic shader (compiled as a whole, has many variants,
-                * may take a long time to compile).
-                */
-               if (!si_compile_shader(sscreen, compiler, shader, debug))
-                       return false;
-       } else {
-               /* The shader consists of several parts:
-                *
-                * - the middle part is the user shader, it has 1 variant only
-                *   and it was compiled during the creation of the shader
-                *   selector
-                * - the prolog part is inserted at the beginning
-                * - the epilog part is inserted at the end
-                *
-                * The prolog and epilog have many (but simple) variants.
-                *
-                * Starting with gfx9, geometry and tessellation control
-                * shaders also contain the prolog and user shader parts of
-                * the previous shader stage.
-                */
-
-               if (!mainp)
-                       return false;
-
-               /* Copy the compiled shader data over. */
-               shader->is_binary_shared = true;
-               shader->binary = mainp->binary;
-               shader->config = mainp->config;
-               shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
-               shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
-               shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
-               shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index;
-               memcpy(shader->info.vs_output_param_offset,
-                      mainp->info.vs_output_param_offset,
-                      sizeof(mainp->info.vs_output_param_offset));
-               shader->info.uses_instanceid = mainp->info.uses_instanceid;
-               shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
-               shader->info.nr_param_exports = mainp->info.nr_param_exports;
-
-               /* Select prologs and/or epilogs. */
-               switch (sel->type) {
-               case PIPE_SHADER_VERTEX:
-                       if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug))
-                               return false;
-                       break;
-               case PIPE_SHADER_TESS_CTRL:
-                       if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
-                               return false;
-                       break;
-               case PIPE_SHADER_TESS_EVAL:
-                       break;
-               case PIPE_SHADER_GEOMETRY:
-                       if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
-                               return false;
-                       break;
-               case PIPE_SHADER_FRAGMENT:
-                       if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
-                               return false;
-
-                       /* Make sure we have at least as many VGPRs as there
-                        * are allocated inputs.
-                        */
-                       shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
-                                                       shader->info.num_input_vgprs);
-                       break;
-               default:;
-               }
-
-               /* Update SGPR and VGPR counts. */
-               if (shader->prolog) {
-                       shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
-                                                       shader->prolog->config.num_sgprs);
-                       shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
-                                                       shader->prolog->config.num_vgprs);
-               }
-               if (shader->previous_stage) {
-                       shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
-                                                       shader->previous_stage->config.num_sgprs);
-                       shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
-                                                       shader->previous_stage->config.num_vgprs);
-                       shader->config.spilled_sgprs =
-                               MAX2(shader->config.spilled_sgprs,
-                                    shader->previous_stage->config.spilled_sgprs);
-                       shader->config.spilled_vgprs =
-                               MAX2(shader->config.spilled_vgprs,
-                                    shader->previous_stage->config.spilled_vgprs);
-                       shader->info.private_mem_vgprs =
-                               MAX2(shader->info.private_mem_vgprs,
-                                    shader->previous_stage->info.private_mem_vgprs);
-                       shader->config.scratch_bytes_per_wave =
-                               MAX2(shader->config.scratch_bytes_per_wave,
-                                    shader->previous_stage->config.scratch_bytes_per_wave);
-                       shader->info.uses_instanceid |=
-                               shader->previous_stage->info.uses_instanceid;
-               }
-               if (shader->prolog2) {
-                       shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
-                                                       shader->prolog2->config.num_sgprs);
-                       shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
-                                                       shader->prolog2->config.num_vgprs);
-               }
-               if (shader->epilog) {
-                       shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
-                                                       shader->epilog->config.num_sgprs);
-                       shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
-                                                       shader->epilog->config.num_vgprs);
-               }
-               si_calculate_max_simd_waves(shader);
-       }
-
-       if (shader->key.as_ngg) {
-               assert(!shader->key.as_es && !shader->key.as_ls);
-               gfx10_ngg_calculate_subgroup_info(shader);
-       } else if (sscreen->info.chip_class >= GFX9 && sel->type == PIPE_SHADER_GEOMETRY) {
-               gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
-       }
-
-       si_fix_resource_usage(sscreen, shader);
-       si_shader_dump(sscreen, shader, debug, stderr, true);
-
-       /* Upload. */
-       if (!si_shader_binary_upload(sscreen, shader, 0)) {
-               fprintf(stderr, "LLVM failed to upload shader\n");
-               return false;
-       }
-
-       return true;
+   struct si_shader_selector *sel = shader->selector;
+   struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
+
+   /* LS, ES, VS are compiled on demand if the main part hasn't been
+    * compiled for that stage.
+    *
+    * GS are compiled on demand if the main part hasn't been compiled
+    * for the chosen NGG-ness.
+    *
+    * Vertex shaders are compiled on demand when a vertex fetch
+    * workaround must be applied.
+    */
+   if (shader->is_monolithic) {
+      /* Monolithic shader (compiled as a whole, has many variants,
+       * may take a long time to compile).
+       */
+      if (!si_compile_shader(sscreen, compiler, shader, debug))
+         return false;
+   } else {
+      /* The shader consists of several parts:
+       *
+       * - the middle part is the user shader, it has 1 variant only
+       *   and it was compiled during the creation of the shader
+       *   selector
+       * - the prolog part is inserted at the beginning
+       * - the epilog part is inserted at the end
+       *
+       * The prolog and epilog have many (but simple) variants.
+       *
+       * Starting with gfx9, geometry and tessellation control
+       * shaders also contain the prolog and user shader parts of
+       * the previous shader stage.
+       */
+
+      if (!mainp)
+         return false;
+
+      /* Copy the compiled shader data over. */
+      shader->is_binary_shared = true;
+      shader->binary = mainp->binary;
+      shader->config = mainp->config;
+      shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
+      shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
+      shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
+      shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index;
+      memcpy(shader->info.vs_output_param_offset, mainp->info.vs_output_param_offset,
+             sizeof(mainp->info.vs_output_param_offset));
+      shader->info.uses_instanceid = mainp->info.uses_instanceid;
+      shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
+      shader->info.nr_param_exports = mainp->info.nr_param_exports;
+
+      /* Select prologs and/or epilogs. */
+      switch (sel->type) {
+      case PIPE_SHADER_VERTEX:
+         if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug))
+            return false;
+         break;
+      case PIPE_SHADER_TESS_CTRL:
+         if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
+            return false;
+         break;
+      case PIPE_SHADER_TESS_EVAL:
+         break;
+      case PIPE_SHADER_GEOMETRY:
+         if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
+            return false;
+         break;
+      case PIPE_SHADER_FRAGMENT:
+         if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
+            return false;
+
+         /* Make sure we have at least as many VGPRs as there
+          * are allocated inputs.
+          */
+         shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs);
+         break;
+      default:;
+      }
+
+      /* Update SGPR and VGPR counts. */
+      if (shader->prolog) {
+         shader->config.num_sgprs =
+            MAX2(shader->config.num_sgprs, shader->prolog->config.num_sgprs);
+         shader->config.num_vgprs =
+            MAX2(shader->config.num_vgprs, shader->prolog->config.num_vgprs);
+      }
+      if (shader->previous_stage) {
+         shader->config.num_sgprs =
+            MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs);
+         shader->config.num_vgprs =
+            MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs);
+         shader->config.spilled_sgprs =
+            MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs);
+         shader->config.spilled_vgprs =
+            MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs);
+         shader->info.private_mem_vgprs =
+            MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs);
+         shader->config.scratch_bytes_per_wave =
+            MAX2(shader->config.scratch_bytes_per_wave,
+                 shader->previous_stage->config.scratch_bytes_per_wave);
+         shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid;
+      }
+      if (shader->prolog2) {
+         shader->config.num_sgprs =
+            MAX2(shader->config.num_sgprs, shader->prolog2->config.num_sgprs);
+         shader->config.num_vgprs =
+            MAX2(shader->config.num_vgprs, shader->prolog2->config.num_vgprs);
+      }
+      if (shader->epilog) {
+         shader->config.num_sgprs =
+            MAX2(shader->config.num_sgprs, shader->epilog->config.num_sgprs);
+         shader->config.num_vgprs =
+            MAX2(shader->config.num_vgprs, shader->epilog->config.num_vgprs);
+      }
+      si_calculate_max_simd_waves(shader);
+   }
+
+   if (shader->key.as_ngg) {
+      assert(!shader->key.as_es && !shader->key.as_ls);
+      gfx10_ngg_calculate_subgroup_info(shader);
+   } else if (sscreen->info.chip_class >= GFX9 && sel->type == PIPE_SHADER_GEOMETRY) {
+      gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
+   }
+
+   si_fix_resource_usage(sscreen, shader);
+   si_shader_dump(sscreen, shader, debug, stderr, true);
+
+   /* Upload. */
+   if (!si_shader_binary_upload(sscreen, shader, 0)) {
+      fprintf(stderr, "LLVM failed to upload shader\n");
+      return false;
+   }
+
+   return true;
 }
 
 void si_shader_binary_clean(struct si_shader_binary *binary)
 {
-       free((void *)binary->elf_buffer);
-       binary->elf_buffer = NULL;
+   free((void *)binary->elf_buffer);
+   binary->elf_buffer = NULL;
 
-       free(binary->llvm_ir_string);
-       binary->llvm_ir_string = NULL;
+   free(binary->llvm_ir_string);
+   binary->llvm_ir_string = NULL;
 }
 
 void si_shader_destroy(struct si_shader *shader)
 {
-       if (shader->scratch_bo)
-               si_resource_reference(&shader->scratch_bo, NULL);
+   if (shader->scratch_bo)
+      si_resource_reference(&shader->scratch_bo, NULL);
 
-       si_resource_reference(&shader->bo, NULL);
+   si_resource_reference(&shader->bo, NULL);
 
-       if (!shader->is_binary_shared)
-               si_shader_binary_clean(&shader->binary);
+   if (!shader->is_binary_shared)
+      si_shader_binary_clean(&shader->binary);
 
-       free(shader->shader_log);
+   free(shader->shader_log);
 }
index ef571a5d684aa1a078ad866359f26e922dc84425..4b3bdf4a30ee5ccf1534680982da5858ab561684 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
 #ifndef SI_SHADER_H
 #define SI_SHADER_H
 
-#include "util/u_inlines.h"
-#include "util/u_live_shader_cache.h"
-#include "util/u_queue.h"
-#include "util/simple_mtx.h"
-
 #include "ac_binary.h"
 #include "ac_llvm_build.h"
 #include "ac_llvm_util.h"
+#include "util/simple_mtx.h"
+#include "util/u_inlines.h"
+#include "util/u_live_shader_cache.h"
+#include "util/u_queue.h"
 
 #include <stdio.h>
 
@@ -150,136 +149,139 @@ struct nir_shader;
 struct si_shader;
 struct si_context;
 
-#define SI_MAX_ATTRIBS         16
-#define SI_MAX_VS_OUTPUTS      40
+#define SI_MAX_ATTRIBS    16
+#define SI_MAX_VS_OUTPUTS 40
 
 /* Shader IO unique indices are supported for TGSI_SEMANTIC_GENERIC with an
  * index smaller than this.
  */
-#define SI_MAX_IO_GENERIC       32
+#define SI_MAX_IO_GENERIC 32
 
 #define SI_NGG_PRIM_EDGE_FLAG_BITS ((1 << 9) | (1 << 19) | (1 << 29))
 
 /* SGPR user data indices */
-enum {
-       SI_SGPR_RW_BUFFERS,  /* rings (& stream-out, VS only) */
-       SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES,
-       SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */
-       SI_SGPR_SAMPLERS_AND_IMAGES,
-       SI_NUM_RESOURCE_SGPRS,
-
-       /* API VS, TES without GS, GS copy shader */
-       SI_SGPR_VS_STATE_BITS = SI_NUM_RESOURCE_SGPRS,
-       SI_NUM_VS_STATE_RESOURCE_SGPRS,
-
-       /* all VS variants */
-       SI_SGPR_BASE_VERTEX = SI_NUM_VS_STATE_RESOURCE_SGPRS,
-       SI_SGPR_START_INSTANCE,
-       SI_SGPR_DRAWID,
-       SI_VS_NUM_USER_SGPR,
-
-       SI_SGPR_VS_BLIT_DATA = SI_SGPR_CONST_AND_SHADER_BUFFERS,
-
-       /* TES */
-       SI_SGPR_TES_OFFCHIP_LAYOUT = SI_NUM_VS_STATE_RESOURCE_SGPRS,
-       SI_SGPR_TES_OFFCHIP_ADDR,
-       SI_TES_NUM_USER_SGPR,
-
-       /* GFX6-8: TCS only */
-       GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS,
-       GFX6_SGPR_TCS_OUT_OFFSETS,
-       GFX6_SGPR_TCS_OUT_LAYOUT,
-       GFX6_SGPR_TCS_IN_LAYOUT,
-       GFX6_TCS_NUM_USER_SGPR,
-
-       /* GFX9: Merged shaders. */
-       /* 2ND_CONST_AND_SHADER_BUFFERS is set in USER_DATA_ADDR_LO (SGPR0). */
-       /* 2ND_SAMPLERS_AND_IMAGES is set in USER_DATA_ADDR_HI (SGPR1). */
-       GFX9_MERGED_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR,
-
-       /* GFX9: Merged LS-HS (VS-TCS) only. */
-       GFX9_SGPR_TCS_OFFCHIP_LAYOUT = GFX9_MERGED_NUM_USER_SGPR,
-       GFX9_SGPR_TCS_OUT_OFFSETS,
-       GFX9_SGPR_TCS_OUT_LAYOUT,
-       GFX9_TCS_NUM_USER_SGPR,
-
-       /* GS limits */
-       GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS,
-       GFX9_VSGS_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR,
-       GFX9_TESGS_NUM_USER_SGPR = SI_TES_NUM_USER_SGPR,
-       SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS,
-
-       /* PS only */
-       SI_SGPR_ALPHA_REF       = SI_NUM_RESOURCE_SGPRS,
-       SI_PS_NUM_USER_SGPR,
-
-       /* The value has to be 12, because the hw requires that descriptors
-        * are aligned to 4 SGPRs.
-        */
-       SI_SGPR_VS_VB_DESCRIPTOR_FIRST = 12,
+enum
+{
+   SI_SGPR_RW_BUFFERS, /* rings (& stream-out, VS only) */
+   SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES,
+   SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */
+   SI_SGPR_SAMPLERS_AND_IMAGES,
+   SI_NUM_RESOURCE_SGPRS,
+
+   /* API VS, TES without GS, GS copy shader */
+   SI_SGPR_VS_STATE_BITS = SI_NUM_RESOURCE_SGPRS,
+   SI_NUM_VS_STATE_RESOURCE_SGPRS,
+
+   /* all VS variants */
+   SI_SGPR_BASE_VERTEX = SI_NUM_VS_STATE_RESOURCE_SGPRS,
+   SI_SGPR_START_INSTANCE,
+   SI_SGPR_DRAWID,
+   SI_VS_NUM_USER_SGPR,
+
+   SI_SGPR_VS_BLIT_DATA = SI_SGPR_CONST_AND_SHADER_BUFFERS,
+
+   /* TES */
+   SI_SGPR_TES_OFFCHIP_LAYOUT = SI_NUM_VS_STATE_RESOURCE_SGPRS,
+   SI_SGPR_TES_OFFCHIP_ADDR,
+   SI_TES_NUM_USER_SGPR,
+
+   /* GFX6-8: TCS only */
+   GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS,
+   GFX6_SGPR_TCS_OUT_OFFSETS,
+   GFX6_SGPR_TCS_OUT_LAYOUT,
+   GFX6_SGPR_TCS_IN_LAYOUT,
+   GFX6_TCS_NUM_USER_SGPR,
+
+   /* GFX9: Merged shaders. */
+   /* 2ND_CONST_AND_SHADER_BUFFERS is set in USER_DATA_ADDR_LO (SGPR0). */
+   /* 2ND_SAMPLERS_AND_IMAGES is set in USER_DATA_ADDR_HI (SGPR1). */
+   GFX9_MERGED_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR,
+
+   /* GFX9: Merged LS-HS (VS-TCS) only. */
+   GFX9_SGPR_TCS_OFFCHIP_LAYOUT = GFX9_MERGED_NUM_USER_SGPR,
+   GFX9_SGPR_TCS_OUT_OFFSETS,
+   GFX9_SGPR_TCS_OUT_LAYOUT,
+   GFX9_TCS_NUM_USER_SGPR,
+
+   /* GS limits */
+   GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS,
+   GFX9_VSGS_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR,
+   GFX9_TESGS_NUM_USER_SGPR = SI_TES_NUM_USER_SGPR,
+   SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS,
+
+   /* PS only */
+   SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS,
+   SI_PS_NUM_USER_SGPR,
+
+   /* The value has to be 12, because the hw requires that descriptors
+    * are aligned to 4 SGPRs.
+    */
+   SI_SGPR_VS_VB_DESCRIPTOR_FIRST = 12,
 };
 
 /* LLVM function parameter indices */
-enum {
-       SI_NUM_RESOURCE_PARAMS = 4,
-
-       /* PS only parameters */
-       SI_PARAM_ALPHA_REF = SI_NUM_RESOURCE_PARAMS,
-       SI_PARAM_PRIM_MASK,
-       SI_PARAM_PERSP_SAMPLE,
-       SI_PARAM_PERSP_CENTER,
-       SI_PARAM_PERSP_CENTROID,
-       SI_PARAM_PERSP_PULL_MODEL,
-       SI_PARAM_LINEAR_SAMPLE,
-       SI_PARAM_LINEAR_CENTER,
-       SI_PARAM_LINEAR_CENTROID,
-       SI_PARAM_LINE_STIPPLE_TEX,
-       SI_PARAM_POS_X_FLOAT,
-       SI_PARAM_POS_Y_FLOAT,
-       SI_PARAM_POS_Z_FLOAT,
-       SI_PARAM_POS_W_FLOAT,
-       SI_PARAM_FRONT_FACE,
-       SI_PARAM_ANCILLARY,
-       SI_PARAM_SAMPLE_COVERAGE,
-       SI_PARAM_POS_FIXED_PT,
-
-       SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +8 for COLOR[0..1] */
+enum
+{
+   SI_NUM_RESOURCE_PARAMS = 4,
+
+   /* PS only parameters */
+   SI_PARAM_ALPHA_REF = SI_NUM_RESOURCE_PARAMS,
+   SI_PARAM_PRIM_MASK,
+   SI_PARAM_PERSP_SAMPLE,
+   SI_PARAM_PERSP_CENTER,
+   SI_PARAM_PERSP_CENTROID,
+   SI_PARAM_PERSP_PULL_MODEL,
+   SI_PARAM_LINEAR_SAMPLE,
+   SI_PARAM_LINEAR_CENTER,
+   SI_PARAM_LINEAR_CENTROID,
+   SI_PARAM_LINE_STIPPLE_TEX,
+   SI_PARAM_POS_X_FLOAT,
+   SI_PARAM_POS_Y_FLOAT,
+   SI_PARAM_POS_Z_FLOAT,
+   SI_PARAM_POS_W_FLOAT,
+   SI_PARAM_FRONT_FACE,
+   SI_PARAM_ANCILLARY,
+   SI_PARAM_SAMPLE_COVERAGE,
+   SI_PARAM_POS_FIXED_PT,
+
+   SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +8 for COLOR[0..1] */
 };
 
 /* Fields of driver-defined VS state SGPR. */
-#define S_VS_STATE_CLAMP_VERTEX_COLOR(x)       (((unsigned)(x) & 0x1) << 0)
-#define C_VS_STATE_CLAMP_VERTEX_COLOR          0xFFFFFFFE
-#define S_VS_STATE_INDEXED(x)                  (((unsigned)(x) & 0x1) << 1)
-#define C_VS_STATE_INDEXED                     0xFFFFFFFD
-#define S_VS_STATE_OUTPRIM(x)                  (((unsigned)(x) & 0x3) << 2)
-#define C_VS_STATE_OUTPRIM                     0xFFFFFFF3
-#define S_VS_STATE_PROVOKING_VTX_INDEX(x)      (((unsigned)(x) & 0x3) << 4)
-#define C_VS_STATE_PROVOKING_VTX_INDEX         0xFFFFFFCF
-#define S_VS_STATE_STREAMOUT_QUERY_ENABLED(x)  (((unsigned)(x) & 0x1) << 6)
-#define C_VS_STATE_STREAMOUT_QUERY_ENABLED     0xFFFFFFBF
-#define S_VS_STATE_SMALL_PRIM_PRECISION(x)     (((unsigned)(x) & 0xF) << 7)
-#define C_VS_STATE_SMALL_PRIM_PRECISION                0xFFFFF87F
-#define S_VS_STATE_LS_OUT_PATCH_SIZE(x)                (((unsigned)(x) & 0x1FFF) << 11)
-#define C_VS_STATE_LS_OUT_PATCH_SIZE           0xFF0007FF
-#define S_VS_STATE_LS_OUT_VERTEX_SIZE(x)       (((unsigned)(x) & 0xFF) << 24)
-#define C_VS_STATE_LS_OUT_VERTEX_SIZE          0x00FFFFFF
-
-enum {
-       /* Use a property enum that CS wouldn't use. */
-       TGSI_PROPERTY_CS_LOCAL_SIZE = TGSI_PROPERTY_FS_COORD_ORIGIN,
-
-       /* These represent the number of SGPRs the shader uses. */
-       SI_VS_BLIT_SGPRS_POS = 3,
-       SI_VS_BLIT_SGPRS_POS_COLOR = 7,
-       SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9,
+#define S_VS_STATE_CLAMP_VERTEX_COLOR(x)      (((unsigned)(x)&0x1) << 0)
+#define C_VS_STATE_CLAMP_VERTEX_COLOR         0xFFFFFFFE
+#define S_VS_STATE_INDEXED(x)                 (((unsigned)(x)&0x1) << 1)
+#define C_VS_STATE_INDEXED                    0xFFFFFFFD
+#define S_VS_STATE_OUTPRIM(x)                 (((unsigned)(x)&0x3) << 2)
+#define C_VS_STATE_OUTPRIM                    0xFFFFFFF3
+#define S_VS_STATE_PROVOKING_VTX_INDEX(x)     (((unsigned)(x)&0x3) << 4)
+#define C_VS_STATE_PROVOKING_VTX_INDEX        0xFFFFFFCF
+#define S_VS_STATE_STREAMOUT_QUERY_ENABLED(x) (((unsigned)(x)&0x1) << 6)
+#define C_VS_STATE_STREAMOUT_QUERY_ENABLED    0xFFFFFFBF
+#define S_VS_STATE_SMALL_PRIM_PRECISION(x)    (((unsigned)(x)&0xF) << 7)
+#define C_VS_STATE_SMALL_PRIM_PRECISION       0xFFFFF87F
+#define S_VS_STATE_LS_OUT_PATCH_SIZE(x)       (((unsigned)(x)&0x1FFF) << 11)
+#define C_VS_STATE_LS_OUT_PATCH_SIZE          0xFF0007FF
+#define S_VS_STATE_LS_OUT_VERTEX_SIZE(x)      (((unsigned)(x)&0xFF) << 24)
+#define C_VS_STATE_LS_OUT_VERTEX_SIZE         0x00FFFFFF
+
+enum
+{
+   /* Use a property enum that CS wouldn't use. */
+   TGSI_PROPERTY_CS_LOCAL_SIZE = TGSI_PROPERTY_FS_COORD_ORIGIN,
+
+   /* These represent the number of SGPRs the shader uses. */
+   SI_VS_BLIT_SGPRS_POS = 3,
+   SI_VS_BLIT_SGPRS_POS_COLOR = 7,
+   SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9,
 };
 
-#define SI_NGG_CULL_VIEW_SMALLPRIMS            (1 << 0) /* view.xy + small prims */
-#define SI_NGG_CULL_BACK_FACE                  (1 << 1) /* back faces */
-#define SI_NGG_CULL_FRONT_FACE                 (1 << 2) /* front faces */
-#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST    (1 << 3) /* GS fast launch: triangles */
-#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP   (1 << 4) /* GS fast launch: triangle strip */
-#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL         (0x3 << 3) /* GS fast launch (both prim types) */
+#define SI_NGG_CULL_VIEW_SMALLPRIMS          (1 << 0)   /* view.xy + small prims */
+#define SI_NGG_CULL_BACK_FACE                (1 << 1)   /* back faces */
+#define SI_NGG_CULL_FRONT_FACE               (1 << 2)   /* front faces */
+#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST  (1 << 3)   /* GS fast launch: triangles */
+#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP (1 << 4)   /* GS fast launch: triangle strip */
+#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL       (0x3 << 3) /* GS fast launch (both prim types) */
 
 /**
  * For VS shader keys, describe any fixups required for vertex fetch.
@@ -292,190 +294,190 @@ enum {
  * buffer_load_format_xyzw).
  */
 union si_vs_fix_fetch {
-       struct {
-               uint8_t log_size : 2; /* 1, 2, 4, 8 or bytes per channel */
-               uint8_t num_channels_m1 : 2; /* number of channels minus 1 */
-               uint8_t format : 3; /* AC_FETCH_FORMAT_xxx */
-               uint8_t reverse : 1; /* reverse XYZ channels */
-       } u;
-       uint8_t bits;
+   struct {
+      uint8_t log_size : 2;        /* 1, 2, 4 or 8 bytes per channel */
+      uint8_t num_channels_m1 : 2; /* number of channels minus 1 */
+      uint8_t format : 3;          /* AC_FETCH_FORMAT_xxx */
+      uint8_t reverse : 1;         /* reverse XYZ channels */
+   } u;
+   uint8_t bits;
 };
 
 struct si_shader;
 
 /* State of the context creating the shader object. */
 struct si_compiler_ctx_state {
-       /* Should only be used by si_init_shader_selector_async and
-        * si_build_shader_variant if thread_index == -1 (non-threaded). */
-       struct ac_llvm_compiler         *compiler;
+   /* Should only be used by si_init_shader_selector_async and
+    * si_build_shader_variant if thread_index == -1 (non-threaded). */
+   struct ac_llvm_compiler *compiler;
 
-       /* Used if thread_index == -1 or if debug.async is true. */
-       struct pipe_debug_callback      debug;
+   /* Used if thread_index == -1 or if debug.async is true. */
+   struct pipe_debug_callback debug;
 
-       /* Used for creating the log string for gallium/ddebug. */
-       bool                            is_debug_context;
+   /* Used for creating the log string for gallium/ddebug. */
+   bool is_debug_context;
 };
 
 struct si_shader_info {
-       ubyte num_inputs;
-       ubyte num_outputs;
-       ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */
-       ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS];
-       ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS];
-       ubyte input_interpolate_loc[PIPE_MAX_SHADER_INPUTS];
-       ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS];
-       ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */
-       ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS];
-       ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS];
-       ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS];
-
-       ubyte processor;
-
-       int constbuf0_num_slots;
-       unsigned const_buffers_declared; /**< bitmask of declared const buffers */
-       unsigned samplers_declared; /**< bitmask of declared samplers */
-       ubyte num_stream_output_components[4];
-
-       uint num_memory_instructions; /**< sampler, buffer, and image instructions */
-
-       /**
-        * If a tessellation control shader reads outputs, this describes which ones.
-        */
-       bool reads_pervertex_outputs;
-       bool reads_perpatch_outputs;
-       bool reads_tessfactor_outputs;
-
-       ubyte colors_read; /**< which color components are read by the FS */
-       ubyte colors_written;
-       bool reads_samplemask; /**< does fragment shader read sample mask? */
-       bool reads_tess_factors; /**< If TES reads TESSINNER or TESSOUTER */
-       bool writes_z;  /**< does fragment shader write Z value? */
-       bool writes_stencil; /**< does fragment shader write stencil value? */
-       bool writes_samplemask; /**< does fragment shader write sample mask? */
-       bool writes_edgeflag; /**< vertex shader outputs edgeflag */
-       bool uses_kill;  /**< KILL or KILL_IF instruction used? */
-       bool uses_persp_center;
-       bool uses_persp_centroid;
-       bool uses_persp_sample;
-       bool uses_linear_center;
-       bool uses_linear_centroid;
-       bool uses_linear_sample;
-       bool uses_persp_opcode_interp_sample;
-       bool uses_linear_opcode_interp_sample;
-       bool uses_instanceid;
-       bool uses_vertexid;
-       bool uses_vertexid_nobase;
-       bool uses_basevertex;
-       bool uses_drawid;
-       bool uses_primid;
-       bool uses_frontface;
-       bool uses_invocationid;
-       bool uses_thread_id[3];
-       bool uses_block_id[3];
-       bool uses_block_size;
-       bool uses_grid_size;
-       bool uses_subgroup_info;
-       bool writes_position;
-       bool writes_psize;
-       bool writes_clipvertex;
-       bool writes_primid;
-       bool writes_viewport_index;
-       bool writes_layer;
-       bool writes_memory; /**< contains stores or atomics to buffers or images */
-       bool uses_derivatives;
-       bool uses_bindless_samplers;
-       bool uses_bindless_images;
-       bool uses_fbfetch;
-       unsigned clipdist_writemask;
-       unsigned culldist_writemask;
-       unsigned num_written_culldistance;
-       unsigned num_written_clipdistance;
-
-       unsigned images_declared; /**< bitmask of declared images */
-       unsigned msaa_images_declared; /**< bitmask of declared MSAA images */
-       unsigned shader_buffers_declared; /**< bitmask of declared shader buffers */
-
-       unsigned properties[TGSI_PROPERTY_COUNT]; /* index with TGSI_PROPERTY_ */
-
-       /** Whether all codepaths write tess factors in all invocations. */
-       bool tessfactors_are_def_in_all_invocs;
+   ubyte num_inputs;
+   ubyte num_outputs;
+   ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */
+   ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS];
+   ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS];
+   ubyte input_interpolate_loc[PIPE_MAX_SHADER_INPUTS];
+   ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS];
+   ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */
+   ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS];
+   ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS];
+   ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS];
+
+   ubyte processor;
+
+   int constbuf0_num_slots;
+   unsigned const_buffers_declared; /**< bitmask of declared const buffers */
+   unsigned samplers_declared;      /**< bitmask of declared samplers */
+   ubyte num_stream_output_components[4];
+
+   uint num_memory_instructions; /**< sampler, buffer, and image instructions */
+
+   /**
+    * If a tessellation control shader reads outputs, this describes which ones.
+    */
+   bool reads_pervertex_outputs;
+   bool reads_perpatch_outputs;
+   bool reads_tessfactor_outputs;
+
+   ubyte colors_read; /**< which color components are read by the FS */
+   ubyte colors_written;
+   bool reads_samplemask;   /**< does fragment shader read sample mask? */
+   bool reads_tess_factors; /**< If TES reads TESSINNER or TESSOUTER */
+   bool writes_z;           /**< does fragment shader write Z value? */
+   bool writes_stencil;     /**< does fragment shader write stencil value? */
+   bool writes_samplemask;  /**< does fragment shader write sample mask? */
+   bool writes_edgeflag;    /**< vertex shader outputs edgeflag */
+   bool uses_kill;          /**< KILL or KILL_IF instruction used? */
+   bool uses_persp_center;
+   bool uses_persp_centroid;
+   bool uses_persp_sample;
+   bool uses_linear_center;
+   bool uses_linear_centroid;
+   bool uses_linear_sample;
+   bool uses_persp_opcode_interp_sample;
+   bool uses_linear_opcode_interp_sample;
+   bool uses_instanceid;
+   bool uses_vertexid;
+   bool uses_vertexid_nobase;
+   bool uses_basevertex;
+   bool uses_drawid;
+   bool uses_primid;
+   bool uses_frontface;
+   bool uses_invocationid;
+   bool uses_thread_id[3];
+   bool uses_block_id[3];
+   bool uses_block_size;
+   bool uses_grid_size;
+   bool uses_subgroup_info;
+   bool writes_position;
+   bool writes_psize;
+   bool writes_clipvertex;
+   bool writes_primid;
+   bool writes_viewport_index;
+   bool writes_layer;
+   bool writes_memory; /**< contains stores or atomics to buffers or images */
+   bool uses_derivatives;
+   bool uses_bindless_samplers;
+   bool uses_bindless_images;
+   bool uses_fbfetch;
+   unsigned clipdist_writemask;
+   unsigned culldist_writemask;
+   unsigned num_written_culldistance;
+   unsigned num_written_clipdistance;
+
+   unsigned images_declared;         /**< bitmask of declared images */
+   unsigned msaa_images_declared;    /**< bitmask of declared MSAA images */
+   unsigned shader_buffers_declared; /**< bitmask of declared shader buffers */
+
+   unsigned properties[TGSI_PROPERTY_COUNT]; /* index with TGSI_PROPERTY_ */
+
+   /** Whether all codepaths write tess factors in all invocations. */
+   bool tessfactors_are_def_in_all_invocs;
 };
 
 /* A shader selector is a gallium CSO and contains shader variants and
  * binaries for one NIR program. This can be shared by multiple contexts.
  */
 struct si_shader_selector {
-       struct util_live_shader base;
-       struct si_screen        *screen;
-       struct util_queue_fence ready;
-       struct si_compiler_ctx_state compiler_ctx_state;
-
-       simple_mtx_t            mutex;
-       struct si_shader        *first_variant; /* immutable after the first variant */
-       struct si_shader        *last_variant; /* mutable */
-
-       /* The compiled NIR shader without a prolog and/or epilog (not
-        * uploaded to a buffer object).
-        */
-       struct si_shader        *main_shader_part;
-       struct si_shader        *main_shader_part_ls; /* as_ls is set in the key */
-       struct si_shader        *main_shader_part_es; /* as_es is set in the key */
-       struct si_shader        *main_shader_part_ngg; /* as_ngg is set in the key */
-       struct si_shader        *main_shader_part_ngg_es; /* for Wave32 TES before legacy GS */
-
-       struct si_shader        *gs_copy_shader;
-
-       struct nir_shader       *nir;
-       void                    *nir_binary;
-       unsigned                nir_size;
-
-       struct pipe_stream_output_info  so;
-       struct si_shader_info           info;
-
-       /* PIPE_SHADER_[VERTEX|FRAGMENT|...] */
-       enum pipe_shader_type type;
-       bool            vs_needs_prolog;
-       bool            prim_discard_cs_allowed;
-       bool            ngg_culling_allowed;
-       unsigned        num_vs_inputs;
-       unsigned        num_vbos_in_user_sgprs;
-       unsigned        pa_cl_vs_out_cntl;
-       ubyte           clipdist_mask;
-       ubyte           culldist_mask;
-       unsigned        rast_prim;
-
-       /* ES parameters. */
-       unsigned        esgs_itemsize; /* vertex stride */
-       unsigned        lshs_vertex_stride;
-
-       /* GS parameters. */
-       unsigned        gs_input_verts_per_prim;
-       unsigned        gs_output_prim;
-       unsigned        gs_max_out_vertices;
-       unsigned        gs_num_invocations;
-       unsigned        max_gs_stream; /* count - 1 */
-       unsigned        gsvs_vertex_size;
-       unsigned        max_gsvs_emit_size;
-       unsigned        enabled_streamout_buffer_mask;
-       bool            tess_turns_off_ngg;
-
-       /* PS parameters. */
-       unsigned        color_attr_index[2];
-       unsigned        db_shader_control;
-       /* Set 0xf or 0x0 (4 bits) per each written output.
-        * ANDed with spi_shader_col_format.
-        */
-       unsigned        colors_written_4bit;
-
-       uint64_t        outputs_written_before_ps; /* "get_unique_index" bits */
-       uint64_t        outputs_written;        /* "get_unique_index" bits */
-       uint32_t        patch_outputs_written;  /* "get_unique_index_patch" bits */
-
-       uint64_t        inputs_read;            /* "get_unique_index" bits */
-
-       /* bitmasks of used descriptor slots */
-       uint32_t        active_const_and_shader_buffers;
-       uint64_t        active_samplers_and_images;
+   struct util_live_shader base;
+   struct si_screen *screen;
+   struct util_queue_fence ready;
+   struct si_compiler_ctx_state compiler_ctx_state;
+
+   simple_mtx_t mutex;
+   struct si_shader *first_variant; /* immutable after the first variant */
+   struct si_shader *last_variant;  /* mutable */
+
+   /* The compiled NIR shader without a prolog and/or epilog (not
+    * uploaded to a buffer object).
+    */
+   struct si_shader *main_shader_part;
+   struct si_shader *main_shader_part_ls;     /* as_ls is set in the key */
+   struct si_shader *main_shader_part_es;     /* as_es is set in the key */
+   struct si_shader *main_shader_part_ngg;    /* as_ngg is set in the key */
+   struct si_shader *main_shader_part_ngg_es; /* for Wave32 TES before legacy GS */
+
+   struct si_shader *gs_copy_shader;
+
+   struct nir_shader *nir;
+   void *nir_binary;
+   unsigned nir_size;
+
+   struct pipe_stream_output_info so;
+   struct si_shader_info info;
+
+   /* PIPE_SHADER_[VERTEX|FRAGMENT|...] */
+   enum pipe_shader_type type;
+   bool vs_needs_prolog;
+   bool prim_discard_cs_allowed;
+   bool ngg_culling_allowed;
+   unsigned num_vs_inputs;
+   unsigned num_vbos_in_user_sgprs;
+   unsigned pa_cl_vs_out_cntl;
+   ubyte clipdist_mask;
+   ubyte culldist_mask;
+   unsigned rast_prim;
+
+   /* ES parameters. */
+   unsigned esgs_itemsize; /* vertex stride */
+   unsigned lshs_vertex_stride;
+
+   /* GS parameters. */
+   unsigned gs_input_verts_per_prim;
+   unsigned gs_output_prim;
+   unsigned gs_max_out_vertices;
+   unsigned gs_num_invocations;
+   unsigned max_gs_stream; /* count - 1 */
+   unsigned gsvs_vertex_size;
+   unsigned max_gsvs_emit_size;
+   unsigned enabled_streamout_buffer_mask;
+   bool tess_turns_off_ngg;
+
+   /* PS parameters. */
+   unsigned color_attr_index[2];
+   unsigned db_shader_control;
+   /* Set 0xf or 0x0 (4 bits) per each written output.
+    * ANDed with spi_shader_col_format.
+    */
+   unsigned colors_written_4bit;
+
+   uint64_t outputs_written_before_ps; /* "get_unique_index" bits */
+   uint64_t outputs_written;           /* "get_unique_index" bits */
+   uint32_t patch_outputs_written;     /* "get_unique_index_patch" bits */
+
+   uint64_t inputs_read; /* "get_unique_index" bits */
+
+   /* bitmasks of used descriptor slots */
+   uint32_t active_const_and_shader_buffers;
+   uint64_t active_samplers_and_images;
 };
 
 /* Valid shader configurations:
@@ -506,184 +508,184 @@ struct si_shader_selector {
 
 /* Common VS bits between the shader key and the prolog key. */
 struct si_vs_prolog_bits {
-       /* - If neither "is_one" nor "is_fetched" has a bit set, the instance
-        *   divisor is 0.
-        * - If "is_one" has a bit set, the instance divisor is 1.
-        * - If "is_fetched" has a bit set, the instance divisor will be loaded
-        *   from the constant buffer.
-        */
-       uint16_t        instance_divisor_is_one;     /* bitmask of inputs */
-       uint16_t        instance_divisor_is_fetched; /* bitmask of inputs */
-       unsigned        ls_vgpr_fix:1;
-       unsigned        unpack_instance_id_from_vertex_id:1;
+   /* - If neither "is_one" nor "is_fetched" has a bit set, the instance
+    *   divisor is 0.
+    * - If "is_one" has a bit set, the instance divisor is 1.
+    * - If "is_fetched" has a bit set, the instance divisor will be loaded
+    *   from the constant buffer.
+    */
+   uint16_t instance_divisor_is_one;     /* bitmask of inputs */
+   uint16_t instance_divisor_is_fetched; /* bitmask of inputs */
+   unsigned ls_vgpr_fix : 1;
+   unsigned unpack_instance_id_from_vertex_id : 1;
 };
 
 /* Common TCS bits between the shader key and the epilog key. */
 struct si_tcs_epilog_bits {
-       unsigned        prim_mode:3;
-       unsigned        invoc0_tess_factors_are_def:1;
-       unsigned        tes_reads_tess_factors:1;
+   unsigned prim_mode : 3;
+   unsigned invoc0_tess_factors_are_def : 1;
+   unsigned tes_reads_tess_factors : 1;
 };
 
 struct si_gs_prolog_bits {
-       unsigned        tri_strip_adj_fix:1;
-       unsigned        gfx9_prev_is_vs:1;
+   unsigned tri_strip_adj_fix : 1;
+   unsigned gfx9_prev_is_vs : 1;
 };
 
 /* Common PS bits between the shader key and the prolog key. */
 struct si_ps_prolog_bits {
-       unsigned        color_two_side:1;
-       unsigned        flatshade_colors:1;
-       unsigned        poly_stipple:1;
-       unsigned        force_persp_sample_interp:1;
-       unsigned        force_linear_sample_interp:1;
-       unsigned        force_persp_center_interp:1;
-       unsigned        force_linear_center_interp:1;
-       unsigned        bc_optimize_for_persp:1;
-       unsigned        bc_optimize_for_linear:1;
-       unsigned        samplemask_log_ps_iter:3;
+   unsigned color_two_side : 1;
+   unsigned flatshade_colors : 1;
+   unsigned poly_stipple : 1;
+   unsigned force_persp_sample_interp : 1;
+   unsigned force_linear_sample_interp : 1;
+   unsigned force_persp_center_interp : 1;
+   unsigned force_linear_center_interp : 1;
+   unsigned bc_optimize_for_persp : 1;
+   unsigned bc_optimize_for_linear : 1;
+   unsigned samplemask_log_ps_iter : 3;
 };
 
 /* Common PS bits between the shader key and the epilog key. */
 struct si_ps_epilog_bits {
-       unsigned        spi_shader_col_format;
-       unsigned        color_is_int8:8;
-       unsigned        color_is_int10:8;
-       unsigned        last_cbuf:3;
-       unsigned        alpha_func:3;
-       unsigned        alpha_to_one:1;
-       unsigned        poly_line_smoothing:1;
-       unsigned        clamp_color:1;
+   unsigned spi_shader_col_format;
+   unsigned color_is_int8 : 8;
+   unsigned color_is_int10 : 8;
+   unsigned last_cbuf : 3;
+   unsigned alpha_func : 3;
+   unsigned alpha_to_one : 1;
+   unsigned poly_line_smoothing : 1;
+   unsigned clamp_color : 1;
 };
 
 union si_shader_part_key {
-       struct {
-               struct si_vs_prolog_bits states;
-               unsigned        num_input_sgprs:6;
-               /* For merged stages such as LS-HS, HS input VGPRs are first. */
-               unsigned        num_merged_next_stage_vgprs:3;
-               unsigned        num_inputs:5;
-               unsigned        as_ls:1;
-               unsigned        as_es:1;
-               unsigned        as_ngg:1;
-               unsigned        as_prim_discard_cs:1;
-               unsigned        has_ngg_cull_inputs:1; /* from the NGG cull shader */
-               unsigned        gs_fast_launch_tri_list:1; /* for NGG culling */
-               unsigned        gs_fast_launch_tri_strip:1; /* for NGG culling */
-               /* Prologs for monolithic shaders shouldn't set EXEC. */
-               unsigned        is_monolithic:1;
-       } vs_prolog;
-       struct {
-               struct si_tcs_epilog_bits states;
-       } tcs_epilog;
-       struct {
-               struct si_gs_prolog_bits states;
-               /* Prologs of monolithic shaders shouldn't set EXEC. */
-               unsigned        is_monolithic:1;
-               unsigned        as_ngg:1;
-       } gs_prolog;
-       struct {
-               struct si_ps_prolog_bits states;
-               unsigned        num_input_sgprs:6;
-               unsigned        num_input_vgprs:5;
-               /* Color interpolation and two-side color selection. */
-               unsigned        colors_read:8; /* color input components read */
-               unsigned        num_interp_inputs:5; /* BCOLOR is at this location */
-               unsigned        face_vgpr_index:5;
-               unsigned        ancillary_vgpr_index:5;
-               unsigned        wqm:1;
-               char            color_attr_index[2];
-               signed char     color_interp_vgpr_index[2]; /* -1 == constant */
-       } ps_prolog;
-       struct {
-               struct si_ps_epilog_bits states;
-               unsigned        colors_written:8;
-               unsigned        writes_z:1;
-               unsigned        writes_stencil:1;
-               unsigned        writes_samplemask:1;
-       } ps_epilog;
+   struct {
+      struct si_vs_prolog_bits states;
+      unsigned num_input_sgprs : 6;
+      /* For merged stages such as LS-HS, HS input VGPRs are first. */
+      unsigned num_merged_next_stage_vgprs : 3;
+      unsigned num_inputs : 5;
+      unsigned as_ls : 1;
+      unsigned as_es : 1;
+      unsigned as_ngg : 1;
+      unsigned as_prim_discard_cs : 1;
+      unsigned has_ngg_cull_inputs : 1;      /* from the NGG cull shader */
+      unsigned gs_fast_launch_tri_list : 1;  /* for NGG culling */
+      unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */
+      /* Prologs for monolithic shaders shouldn't set EXEC. */
+      unsigned is_monolithic : 1;
+   } vs_prolog;
+   struct {
+      struct si_tcs_epilog_bits states;
+   } tcs_epilog;
+   struct {
+      struct si_gs_prolog_bits states;
+      /* Prologs of monolithic shaders shouldn't set EXEC. */
+      unsigned is_monolithic : 1;
+      unsigned as_ngg : 1;
+   } gs_prolog;
+   struct {
+      struct si_ps_prolog_bits states;
+      unsigned num_input_sgprs : 6;
+      unsigned num_input_vgprs : 5;
+      /* Color interpolation and two-side color selection. */
+      unsigned colors_read : 8;       /* color input components read */
+      unsigned num_interp_inputs : 5; /* BCOLOR is at this location */
+      unsigned face_vgpr_index : 5;
+      unsigned ancillary_vgpr_index : 5;
+      unsigned wqm : 1;
+      char color_attr_index[2];
+      signed char color_interp_vgpr_index[2]; /* -1 == constant */
+   } ps_prolog;
+   struct {
+      struct si_ps_epilog_bits states;
+      unsigned colors_written : 8;
+      unsigned writes_z : 1;
+      unsigned writes_stencil : 1;
+      unsigned writes_samplemask : 1;
+   } ps_epilog;
 };
 
 struct si_shader_key {
-       /* Prolog and epilog flags. */
-       union {
-               struct {
-                       struct si_vs_prolog_bits prolog;
-               } vs;
-               struct {
-                       struct si_vs_prolog_bits ls_prolog; /* for merged LS-HS */
-                       struct si_shader_selector *ls;   /* for merged LS-HS */
-                       struct si_tcs_epilog_bits epilog;
-               } tcs; /* tessellation control shader */
-               struct {
-                       struct si_vs_prolog_bits vs_prolog; /* for merged ES-GS */
-                       struct si_shader_selector *es;   /* for merged ES-GS */
-                       struct si_gs_prolog_bits prolog;
-               } gs;
-               struct {
-                       struct si_ps_prolog_bits prolog;
-                       struct si_ps_epilog_bits epilog;
-               } ps;
-       } part;
-
-       /* These three are initially set according to the NEXT_SHADER property,
-        * or guessed if the property doesn't seem correct.
-        */
-       unsigned as_es:1; /* export shader, which precedes GS */
-       unsigned as_ls:1; /* local shader, which precedes TCS */
-       unsigned as_ngg:1; /* VS, TES, or GS compiled as NGG primitive shader */
-
-       /* Flags for monolithic compilation only. */
-       struct {
-               /* Whether fetch should be opencoded according to vs_fix_fetch.
-                * Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw
-                * with minimal fixups is used. */
-               uint16_t vs_fetch_opencode;
-               union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS];
-
-               union {
-                       uint64_t        ff_tcs_inputs_to_copy; /* for fixed-func TCS */
-                       /* When PS needs PrimID and GS is disabled. */
-                       unsigned        vs_export_prim_id:1;
-                       struct {
-                               unsigned interpolate_at_sample_force_center:1;
-                               unsigned fbfetch_msaa:1;
-                               unsigned fbfetch_is_1D:1;
-                               unsigned fbfetch_layered:1;
-                       } ps;
-               } u;
-       } mono;
-
-       /* Optimization flags for asynchronous compilation only. */
-       struct {
-               /* For HW VS (it can be VS, TES, GS) */
-               uint64_t        kill_outputs; /* "get_unique_index" bits */
-               unsigned        clip_disable:1;
-
-               /* For NGG VS and TES. */
-               unsigned        ngg_culling:5; /* SI_NGG_CULL_* */
-
-               /* For shaders where monolithic variants have better code.
-                *
-                * This is a flag that has no effect on code generation,
-                * but forces monolithic shaders to be used as soon as
-                * possible, because it's in the "opt" group.
-                */
-               unsigned        prefer_mono:1;
-
-               /* Primitive discard compute shader. */
-               unsigned        vs_as_prim_discard_cs:1;
-               unsigned        cs_prim_type:4;
-               unsigned        cs_indexed:1;
-               unsigned        cs_instancing:1;
-               unsigned        cs_primitive_restart:1;
-               unsigned        cs_provoking_vertex_first:1;
-               unsigned        cs_need_correct_orientation:1;
-               unsigned        cs_cull_front:1;
-               unsigned        cs_cull_back:1;
-               unsigned        cs_cull_z:1;
-               unsigned        cs_halfz_clip_space:1;
-       } opt;
+   /* Prolog and epilog flags. */
+   union {
+      struct {
+         struct si_vs_prolog_bits prolog;
+      } vs;
+      struct {
+         struct si_vs_prolog_bits ls_prolog; /* for merged LS-HS */
+         struct si_shader_selector *ls;      /* for merged LS-HS */
+         struct si_tcs_epilog_bits epilog;
+      } tcs; /* tessellation control shader */
+      struct {
+         struct si_vs_prolog_bits vs_prolog; /* for merged ES-GS */
+         struct si_shader_selector *es;      /* for merged ES-GS */
+         struct si_gs_prolog_bits prolog;
+      } gs;
+      struct {
+         struct si_ps_prolog_bits prolog;
+         struct si_ps_epilog_bits epilog;
+      } ps;
+   } part;
+
+   /* These three are initially set according to the NEXT_SHADER property,
+    * or guessed if the property doesn't seem correct.
+    */
+   unsigned as_es : 1;  /* export shader, which precedes GS */
+   unsigned as_ls : 1;  /* local shader, which precedes TCS */
+   unsigned as_ngg : 1; /* VS, TES, or GS compiled as NGG primitive shader */
+
+   /* Flags for monolithic compilation only. */
+   struct {
+      /* Whether fetch should be opencoded according to vs_fix_fetch.
+       * Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw
+       * with minimal fixups is used. */
+      uint16_t vs_fetch_opencode;
+      union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS];
+
+      union {
+         uint64_t ff_tcs_inputs_to_copy; /* for fixed-func TCS */
+         /* When PS needs PrimID and GS is disabled. */
+         unsigned vs_export_prim_id : 1;
+         struct {
+            unsigned interpolate_at_sample_force_center : 1;
+            unsigned fbfetch_msaa : 1;
+            unsigned fbfetch_is_1D : 1;
+            unsigned fbfetch_layered : 1;
+         } ps;
+      } u;
+   } mono;
+
+   /* Optimization flags for asynchronous compilation only. */
+   struct {
+      /* For HW VS (it can be VS, TES, GS) */
+      uint64_t kill_outputs; /* "get_unique_index" bits */
+      unsigned clip_disable : 1;
+
+      /* For NGG VS and TES. */
+      unsigned ngg_culling : 5; /* SI_NGG_CULL_* */
+
+      /* For shaders where monolithic variants have better code.
+       *
+       * This is a flag that has no effect on code generation,
+       * but forces monolithic shaders to be used as soon as
+       * possible, because it's in the "opt" group.
+       */
+      unsigned prefer_mono : 1;
+
+      /* Primitive discard compute shader. */
+      unsigned vs_as_prim_discard_cs : 1;
+      unsigned cs_prim_type : 4;
+      unsigned cs_indexed : 1;
+      unsigned cs_instancing : 1;
+      unsigned cs_primitive_restart : 1;
+      unsigned cs_provoking_vertex_first : 1;
+      unsigned cs_need_correct_orientation : 1;
+      unsigned cs_cull_front : 1;
+      unsigned cs_cull_back : 1;
+      unsigned cs_cull_z : 1;
+      unsigned cs_halfz_clip_space : 1;
+   } opt;
 };
 
 /* Restore the pack alignment to default. */
@@ -691,232 +693,214 @@ struct si_shader_key {
 
 /* GCN-specific shader info. */
 struct si_shader_binary_info {
-       ubyte                   vs_output_param_offset[SI_MAX_VS_OUTPUTS];
-       ubyte                   num_input_sgprs;
-       ubyte                   num_input_vgprs;
-       signed char             face_vgpr_index;
-       signed char             ancillary_vgpr_index;
-       bool                    uses_instanceid;
-       ubyte                   nr_pos_exports;
-       ubyte                   nr_param_exports;
-       unsigned                private_mem_vgprs;
-       unsigned                max_simd_waves;
+   ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS];
+   ubyte num_input_sgprs;
+   ubyte num_input_vgprs;
+   signed char face_vgpr_index;
+   signed char ancillary_vgpr_index;
+   bool uses_instanceid;
+   ubyte nr_pos_exports;
+   ubyte nr_param_exports;
+   unsigned private_mem_vgprs;
+   unsigned max_simd_waves;
 };
 
 struct si_shader_binary {
-       const char *elf_buffer;
-       size_t elf_size;
+   const char *elf_buffer;
+   size_t elf_size;
 
-       char *llvm_ir_string;
+   char *llvm_ir_string;
 };
 
 struct gfx9_gs_info {
-       unsigned es_verts_per_subgroup;
-       unsigned gs_prims_per_subgroup;
-       unsigned gs_inst_prims_in_subgroup;
-       unsigned max_prims_per_subgroup;
-       unsigned esgs_ring_size; /* in bytes */
+   unsigned es_verts_per_subgroup;
+   unsigned gs_prims_per_subgroup;
+   unsigned gs_inst_prims_in_subgroup;
+   unsigned max_prims_per_subgroup;
+   unsigned esgs_ring_size; /* in bytes */
 };
 
 struct si_shader {
-       struct si_compiler_ctx_state    compiler_ctx_state;
-
-       struct si_shader_selector       *selector;
-       struct si_shader_selector       *previous_stage_sel; /* for refcounting */
-       struct si_shader                *next_variant;
-
-       struct si_shader_part           *prolog;
-       struct si_shader                *previous_stage; /* for GFX9 */
-       struct si_shader_part           *prolog2;
-       struct si_shader_part           *epilog;
-
-       struct si_pm4_state             *pm4;
-       struct si_resource              *bo;
-       struct si_resource              *scratch_bo;
-       struct si_shader_key            key;
-       struct util_queue_fence         ready;
-       bool                            compilation_failed;
-       bool                            is_monolithic;
-       bool                            is_optimized;
-       bool                            is_binary_shared;
-       bool                            is_gs_copy_shader;
-
-       /* The following data is all that's needed for binary shaders. */
-       struct si_shader_binary         binary;
-       struct ac_shader_config         config;
-       struct si_shader_binary_info    info;
-
-       struct {
-               uint16_t ngg_emit_size; /* in dwords */
-               uint16_t hw_max_esverts;
-               uint16_t max_gsprims;
-               uint16_t max_out_verts;
-               uint16_t prim_amp_factor;
-               bool max_vert_out_per_gs_instance;
-       } ngg;
-
-       /* Shader key + LLVM IR + disassembly + statistics.
-        * Generated for debug contexts only.
-        */
-       char                            *shader_log;
-       size_t                          shader_log_size;
-
-       struct gfx9_gs_info gs_info;
-
-       /* For save precompute context registers values. */
-       union {
-               struct {
-                       unsigned        vgt_gsvs_ring_offset_1;
-                       unsigned        vgt_gsvs_ring_offset_2;
-                       unsigned        vgt_gsvs_ring_offset_3;
-                       unsigned        vgt_gsvs_ring_itemsize;
-                       unsigned        vgt_gs_max_vert_out;
-                       unsigned        vgt_gs_vert_itemsize;
-                       unsigned        vgt_gs_vert_itemsize_1;
-                       unsigned        vgt_gs_vert_itemsize_2;
-                       unsigned        vgt_gs_vert_itemsize_3;
-                       unsigned        vgt_gs_instance_cnt;
-                       unsigned        vgt_gs_onchip_cntl;
-                       unsigned        vgt_gs_max_prims_per_subgroup;
-                       unsigned        vgt_esgs_ring_itemsize;
-               } gs;
-
-               struct {
-                       unsigned        ge_max_output_per_subgroup;
-                       unsigned        ge_ngg_subgrp_cntl;
-                       unsigned        vgt_primitiveid_en;
-                       unsigned        vgt_gs_onchip_cntl;
-                       unsigned        vgt_gs_instance_cnt;
-                       unsigned        vgt_esgs_ring_itemsize;
-                       unsigned        spi_vs_out_config;
-                       unsigned        spi_shader_idx_format;
-                       unsigned        spi_shader_pos_format;
-                       unsigned        pa_cl_vte_cntl;
-                       unsigned        pa_cl_ngg_cntl;
-                       unsigned        vgt_gs_max_vert_out; /* for API GS */
-                       unsigned        ge_pc_alloc; /* uconfig register */
-               } ngg;
-
-               struct {
-                       unsigned        vgt_gs_mode;
-                       unsigned        vgt_primitiveid_en;
-                       unsigned        vgt_reuse_off;
-                       unsigned        spi_vs_out_config;
-                       unsigned        spi_shader_pos_format;
-                       unsigned        pa_cl_vte_cntl;
-                       unsigned        ge_pc_alloc; /* uconfig register */
-               } vs;
-
-               struct {
-                       unsigned        spi_ps_input_ena;
-                       unsigned        spi_ps_input_addr;
-                       unsigned        spi_baryc_cntl;
-                       unsigned        spi_ps_in_control;
-                       unsigned        spi_shader_z_format;
-                       unsigned        spi_shader_col_format;
-                       unsigned        cb_shader_mask;
-               } ps;
-       } ctx_reg;
-
-       /*For save precompute registers value */
-       unsigned vgt_tf_param; /* VGT_TF_PARAM */
-       unsigned vgt_vertex_reuse_block_cntl; /* VGT_VERTEX_REUSE_BLOCK_CNTL */
-       unsigned pa_cl_vs_out_cntl;
-       unsigned ge_cntl;
+   struct si_compiler_ctx_state compiler_ctx_state;
+
+   struct si_shader_selector *selector;
+   struct si_shader_selector *previous_stage_sel; /* for refcounting */
+   struct si_shader *next_variant;
+
+   struct si_shader_part *prolog;
+   struct si_shader *previous_stage; /* for GFX9 */
+   struct si_shader_part *prolog2;
+   struct si_shader_part *epilog;
+
+   struct si_pm4_state *pm4;
+   struct si_resource *bo;
+   struct si_resource *scratch_bo;
+   struct si_shader_key key;
+   struct util_queue_fence ready;
+   bool compilation_failed;
+   bool is_monolithic;
+   bool is_optimized;
+   bool is_binary_shared;
+   bool is_gs_copy_shader;
+
+   /* The following data is all that's needed for binary shaders. */
+   struct si_shader_binary binary;
+   struct ac_shader_config config;
+   struct si_shader_binary_info info;
+
+   struct {
+      uint16_t ngg_emit_size; /* in dwords */
+      uint16_t hw_max_esverts;
+      uint16_t max_gsprims;
+      uint16_t max_out_verts;
+      uint16_t prim_amp_factor;
+      bool max_vert_out_per_gs_instance;
+   } ngg;
+
+   /* Shader key + LLVM IR + disassembly + statistics.
+    * Generated for debug contexts only.
+    */
+   char *shader_log;
+   size_t shader_log_size;
+
+   struct gfx9_gs_info gs_info;
+
+   /* For save precompute context registers values. */
+   union {
+      struct {
+         unsigned vgt_gsvs_ring_offset_1;
+         unsigned vgt_gsvs_ring_offset_2;
+         unsigned vgt_gsvs_ring_offset_3;
+         unsigned vgt_gsvs_ring_itemsize;
+         unsigned vgt_gs_max_vert_out;
+         unsigned vgt_gs_vert_itemsize;
+         unsigned vgt_gs_vert_itemsize_1;
+         unsigned vgt_gs_vert_itemsize_2;
+         unsigned vgt_gs_vert_itemsize_3;
+         unsigned vgt_gs_instance_cnt;
+         unsigned vgt_gs_onchip_cntl;
+         unsigned vgt_gs_max_prims_per_subgroup;
+         unsigned vgt_esgs_ring_itemsize;
+      } gs;
+
+      struct {
+         unsigned ge_max_output_per_subgroup;
+         unsigned ge_ngg_subgrp_cntl;
+         unsigned vgt_primitiveid_en;
+         unsigned vgt_gs_onchip_cntl;
+         unsigned vgt_gs_instance_cnt;
+         unsigned vgt_esgs_ring_itemsize;
+         unsigned spi_vs_out_config;
+         unsigned spi_shader_idx_format;
+         unsigned spi_shader_pos_format;
+         unsigned pa_cl_vte_cntl;
+         unsigned pa_cl_ngg_cntl;
+         unsigned vgt_gs_max_vert_out; /* for API GS */
+         unsigned ge_pc_alloc;         /* uconfig register */
+      } ngg;
+
+      struct {
+         unsigned vgt_gs_mode;
+         unsigned vgt_primitiveid_en;
+         unsigned vgt_reuse_off;
+         unsigned spi_vs_out_config;
+         unsigned spi_shader_pos_format;
+         unsigned pa_cl_vte_cntl;
+         unsigned ge_pc_alloc; /* uconfig register */
+      } vs;
+
+      struct {
+         unsigned spi_ps_input_ena;
+         unsigned spi_ps_input_addr;
+         unsigned spi_baryc_cntl;
+         unsigned spi_ps_in_control;
+         unsigned spi_shader_z_format;
+         unsigned spi_shader_col_format;
+         unsigned cb_shader_mask;
+      } ps;
+   } ctx_reg;
+
+   /*For save precompute registers value */
+   unsigned vgt_tf_param;                /* VGT_TF_PARAM */
+   unsigned vgt_vertex_reuse_block_cntl; /* VGT_VERTEX_REUSE_BLOCK_CNTL */
+   unsigned pa_cl_vs_out_cntl;
+   unsigned ge_cntl;
 };
 
 struct si_shader_part {
-       struct si_shader_part *next;
-       union si_shader_part_key key;
-       struct si_shader_binary binary;
-       struct ac_shader_config config;
+   struct si_shader_part *next;
+   union si_shader_part_key key;
+   struct si_shader_binary binary;
+   struct ac_shader_config config;
 };
 
 /* si_shader.c */
-bool si_compile_shader(struct si_screen *sscreen,
-                      struct ac_llvm_compiler *compiler,
-                      struct si_shader *shader,
-                      struct pipe_debug_callback *debug);
-bool si_create_shader_variant(struct si_screen *sscreen,
-                             struct ac_llvm_compiler *compiler,
-                             struct si_shader *shader,
-                             struct pipe_debug_callback *debug);
+bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
+                       struct si_shader *shader, struct pipe_debug_callback *debug);
+bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
+                              struct si_shader *shader, struct pipe_debug_callback *debug);
 void si_shader_destroy(struct si_shader *shader);
 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index);
-unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index,
-                                      unsigned is_varying);
+unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index, unsigned is_varying);
 bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
-                            uint64_t scratch_va);
+                             uint64_t scratch_va);
 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
-                   struct pipe_debug_callback *debug,
-                   FILE *f, bool check_debug_option);
-void si_shader_dump_stats_for_shader_db(struct si_screen *screen,
-                                       struct si_shader *shader,
-                                       struct pipe_debug_callback *debug);
-void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
-                                     unsigned *lds_size);
+                    struct pipe_debug_callback *debug, FILE *f, bool check_debug_option);
+void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
+                                        struct pipe_debug_callback *debug);
+void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size);
 const char *si_get_shader_name(const struct si_shader *shader);
 void si_shader_binary_clean(struct si_shader_binary *binary);
 
 /* si_shader_llvm_gs.c */
-struct si_shader *
-si_generate_gs_copy_shader(struct si_screen *sscreen,
-                          struct ac_llvm_compiler *compiler,
-                          struct si_shader_selector *gs_selector,
-                          struct pipe_debug_callback *debug);
+struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
+                                             struct ac_llvm_compiler *compiler,
+                                             struct si_shader_selector *gs_selector,
+                                             struct pipe_debug_callback *debug);
 
 /* si_shader_nir.c */
-void si_nir_scan_shader(const struct nir_shader *nir,
-                       struct si_shader_info *info);
+void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *info);
 void si_nir_adjust_driver_locations(struct nir_shader *nir);
 void si_finalize_nir(struct pipe_screen *screen, void *nirptr, bool optimize);
 
 /* si_state_shaders.c */
-void gfx9_get_gs_info(struct si_shader_selector *es,
-                     struct si_shader_selector *gs,
-                     struct gfx9_gs_info *out);
+void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs,
+                      struct gfx9_gs_info *out);
 
 /* Inline helpers. */
 
 /* Return the pointer to the main shader part's pointer. */
-static inline struct si_shader **
-si_get_main_shader_part(struct si_shader_selector *sel,
-                       struct si_shader_key *key)
+static inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel,
+                                                         struct si_shader_key *key)
 {
-       if (key->as_ls)
-               return &sel->main_shader_part_ls;
-       if (key->as_es && key->as_ngg)
-               return &sel->main_shader_part_ngg_es;
-       if (key->as_es)
-               return &sel->main_shader_part_es;
-       if (key->as_ngg)
-               return &sel->main_shader_part_ngg;
-       return &sel->main_shader_part;
+   if (key->as_ls)
+      return &sel->main_shader_part_ls;
+   if (key->as_es && key->as_ngg)
+      return &sel->main_shader_part_ngg_es;
+   if (key->as_es)
+      return &sel->main_shader_part_es;
+   if (key->as_ngg)
+      return &sel->main_shader_part_ngg;
+   return &sel->main_shader_part;
 }
 
-static inline bool
-gfx10_is_ngg_passthrough(struct si_shader *shader)
+static inline bool gfx10_is_ngg_passthrough(struct si_shader *shader)
 {
-       struct si_shader_selector *sel = shader->selector;
-
-       return sel->type != PIPE_SHADER_GEOMETRY &&
-              !sel->so.num_outputs &&
-              !sel->info.writes_edgeflag &&
-              !shader->key.opt.ngg_culling &&
-              (sel->type != PIPE_SHADER_VERTEX ||
-               !shader->key.mono.u.vs_export_prim_id);
+   struct si_shader_selector *sel = shader->selector;
+
+   return sel->type != PIPE_SHADER_GEOMETRY && !sel->so.num_outputs && !sel->info.writes_edgeflag &&
+          !shader->key.opt.ngg_culling &&
+          (sel->type != PIPE_SHADER_VERTEX || !shader->key.mono.u.vs_export_prim_id);
 }
 
-static inline bool
-si_shader_uses_bindless_samplers(struct si_shader_selector *selector)
+static inline bool si_shader_uses_bindless_samplers(struct si_shader_selector *selector)
 {
-       return selector ? selector->info.uses_bindless_samplers : false;
+   return selector ? selector->info.uses_bindless_samplers : false;
 }
 
-static inline bool
-si_shader_uses_bindless_images(struct si_shader_selector *selector)
+static inline bool si_shader_uses_bindless_images(struct si_shader_selector *selector)
 {
-       return selector ? selector->info.uses_bindless_images : false;
+   return selector ? selector->info.uses_bindless_images : false;
 }
 
 #endif
index 47173142d4424102ec575ab797bb3257a7a78b05..2191604b706f3135bd29548c964483cc066ec747 100644
@@ -25,8 +25,8 @@
 #ifndef SI_SHADER_PRIVATE_H
 #define SI_SHADER_PRIVATE_H
 
-#include "si_shader.h"
 #include "ac_shader_abi.h"
+#include "si_shader.h"
 
 struct pipe_debug_callback;
 
@@ -38,275 +38,245 @@ struct pipe_debug_callback;
 #define PS_EPILOG_SAMPLEMASK_MIN_LOC 14
 
 struct si_shader_output_values {
-       LLVMValueRef values[4];
-       unsigned semantic_name;
-       unsigned semantic_index;
-       ubyte vertex_stream[4];
+   LLVMValueRef values[4];
+   unsigned semantic_name;
+   unsigned semantic_index;
+   ubyte vertex_stream[4];
 };
 
 struct si_shader_context {
-       struct ac_llvm_context ac;
-       struct si_shader *shader;
-       struct si_screen *screen;
-
-       unsigned type; /* PIPE_SHADER_* specifies the type of shader. */
-
-       /* For clamping the non-constant index in resource indexing: */
-       unsigned num_const_buffers;
-       unsigned num_shader_buffers;
-       unsigned num_images;
-       unsigned num_samplers;
-
-       struct ac_shader_args args;
-       struct ac_shader_abi abi;
-
-       LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS];
-
-       LLVMBasicBlockRef merged_wrap_if_entry_block;
-       int merged_wrap_if_label;
-
-       LLVMValueRef main_fn;
-       LLVMTypeRef return_type;
-
-       struct ac_arg const_and_shader_buffers;
-       struct ac_arg samplers_and_images;
-
-       /* For merged shaders, the per-stage descriptors for the stage other
-        * than the one we're processing, used to pass them through from the
-        * first stage to the second.
-        */
-       struct ac_arg other_const_and_shader_buffers;
-       struct ac_arg other_samplers_and_images;
-
-       struct ac_arg rw_buffers;
-       struct ac_arg bindless_samplers_and_images;
-       /* Common inputs for merged shaders. */
-       struct ac_arg merged_wave_info;
-       struct ac_arg merged_scratch_offset;
-       struct ac_arg small_prim_cull_info;
-       /* API VS */
-       struct ac_arg vertex_buffers;
-       struct ac_arg vb_descriptors[5];
-       struct ac_arg rel_auto_id;
-       struct ac_arg vs_prim_id;
-       struct ac_arg vertex_index0;
-       /* VS states and layout of LS outputs / TCS inputs at the end
-        *   [0] = clamp vertex color
-        *   [1] = indexed
-        *   [2:3] = NGG: output primitive type
-        *   [4:5] = NGG: provoking vertex index
-        *   [6]   = NGG: streamout queries enabled
-        *   [7:10] = NGG: small prim filter precision = num_samples / quant_mode,
-        *            but in reality it's: 1/2^n, from 1/16 to 1/4096 = 1/2^4 to 1/2^12
-        *            Only the first 4 bits of the exponent are stored.
-        *            Set it like this: (fui(num_samples / quant_mode) >> 23)
-        *            Expand to FP32 like this: ((0x70 | value) << 23);
-        *            With 0x70 = 112, we get 2^(112 + value - 127) = 2^(value - 15)
-        *            = 1/2^(15 - value) in FP32
-        *   [11:23] = stride between patches in DW = num_inputs * num_vertices * 4
-        *             max = 32*32*4 + 32*4
-        *   [24:31] = stride between vertices in DW = num_inputs * 4
-        *             max = 32*4
-        */
-       struct ac_arg vs_state_bits;
-       struct ac_arg vs_blit_inputs;
-       struct ac_arg ngg_old_thread_id; /* generated by the NGG cull shader */
-       /* HW VS */
-       struct ac_arg streamout_config;
-       struct ac_arg streamout_write_index;
-       struct ac_arg streamout_offset[4];
-
-       /* API TCS & TES */
-       /* Layout of TCS outputs in the offchip buffer
-        * # 6 bits
-        *   [0:5] = the number of patches per threadgroup, max = NUM_PATCHES (40)
-        * # 6 bits
-        *   [6:11] = the number of output vertices per patch, max = 32
-        * # 20 bits
-        *   [12:31] = the offset of per patch attributes in the buffer in bytes.
-        *             max = NUM_PATCHES*32*32*16
-        */
-       struct ac_arg tcs_offchip_layout;
-
-       /* API TCS */
-       /* Offsets where TCS outputs and TCS patch outputs live in LDS:
-        *   [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32
-        *   [16:31] = TCS output patch0 offset for per-patch / 16
-        *             max = (NUM_PATCHES + 1) * 32*32
-        */
-       struct ac_arg tcs_out_lds_offsets;
-       /* Layout of TCS outputs / TES inputs:
-        *   [0:12] = stride between output patches in DW, num_outputs * num_vertices * 4
-        *            max = 32*32*4 + 32*4
-        *   [13:18] = gl_PatchVerticesIn, max = 32
-        *   [19:31] = high 13 bits of the 32-bit address of tessellation ring buffers
-        */
-       struct ac_arg tcs_out_lds_layout;
-       struct ac_arg tcs_offchip_offset;
-       struct ac_arg tcs_factor_offset;
-
-       /* API TES */
-       struct ac_arg tes_offchip_addr;
-       struct ac_arg tes_u;
-       struct ac_arg tes_v;
-       struct ac_arg tes_rel_patch_id;
-       /* HW ES */
-       struct ac_arg es2gs_offset;
-       /* HW GS */
-       /* On gfx10:
-        *  - bits 0..11: ordered_wave_id
-        *  - bits 12..20: number of vertices in group
-        *  - bits 22..30: number of primitives in group
-        */
-       struct ac_arg gs_tg_info;
-       /* API GS */
-       struct ac_arg gs2vs_offset;
-       struct ac_arg gs_wave_id; /* GFX6 */
-       struct ac_arg gs_vtx_offset[6]; /* in dwords (GFX6) */
-       struct ac_arg gs_vtx01_offset; /* in dwords (GFX9) */
-       struct ac_arg gs_vtx23_offset; /* in dwords (GFX9) */
-       struct ac_arg gs_vtx45_offset; /* in dwords (GFX9) */
-       /* PS */
-       struct ac_arg pos_fixed_pt;
-       /* CS */
-       struct ac_arg block_size;
-       struct ac_arg cs_user_data;
-
-       struct ac_llvm_compiler *compiler;
-
-       /* Preloaded descriptors. */
-       LLVMValueRef esgs_ring;
-       LLVMValueRef gsvs_ring[4];
-       LLVMValueRef tess_offchip_ring;
-
-       LLVMValueRef invoc0_tess_factors[6]; /* outer[4], inner[2] */
-       LLVMValueRef gs_next_vertex[4];
-       LLVMValueRef gs_curprim_verts[4];
-       LLVMValueRef gs_generated_prims[4];
-       LLVMValueRef gs_ngg_emit;
-       LLVMValueRef gs_ngg_scratch;
-       LLVMValueRef postponed_kill;
-       LLVMValueRef return_value;
+   struct ac_llvm_context ac;
+   struct si_shader *shader;
+   struct si_screen *screen;
+
+   unsigned type; /* PIPE_SHADER_* specifies the type of shader. */
+
+   /* For clamping the non-constant index in resource indexing: */
+   unsigned num_const_buffers;
+   unsigned num_shader_buffers;
+   unsigned num_images;
+   unsigned num_samplers;
+
+   struct ac_shader_args args;
+   struct ac_shader_abi abi;
+
+   LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS];
+
+   LLVMBasicBlockRef merged_wrap_if_entry_block;
+   int merged_wrap_if_label;
+
+   LLVMValueRef main_fn;
+   LLVMTypeRef return_type;
+
+   struct ac_arg const_and_shader_buffers;
+   struct ac_arg samplers_and_images;
+
+   /* For merged shaders, the per-stage descriptors for the stage other
+    * than the one we're processing, used to pass them through from the
+    * first stage to the second.
+    */
+   struct ac_arg other_const_and_shader_buffers;
+   struct ac_arg other_samplers_and_images;
+
+   struct ac_arg rw_buffers;
+   struct ac_arg bindless_samplers_and_images;
+   /* Common inputs for merged shaders. */
+   struct ac_arg merged_wave_info;
+   struct ac_arg merged_scratch_offset;
+   struct ac_arg small_prim_cull_info;
+   /* API VS */
+   struct ac_arg vertex_buffers;
+   struct ac_arg vb_descriptors[5];
+   struct ac_arg rel_auto_id;
+   struct ac_arg vs_prim_id;
+   struct ac_arg vertex_index0;
+   /* VS states and layout of LS outputs / TCS inputs at the end
+    *   [0] = clamp vertex color
+    *   [1] = indexed
+    *   [2:3] = NGG: output primitive type
+    *   [4:5] = NGG: provoking vertex index
+    *   [6]   = NGG: streamout queries enabled
+    *   [7:10] = NGG: small prim filter precision = num_samples / quant_mode,
+    *            but in reality it's: 1/2^n, from 1/16 to 1/4096 = 1/2^4 to 1/2^12
+    *            Only the first 4 bits of the exponent are stored.
+    *            Set it like this: (fui(num_samples / quant_mode) >> 23)
+    *            Expand to FP32 like this: ((0x70 | value) << 23);
+    *            With 0x70 = 112, we get 2^(112 + value - 127) = 2^(value - 15)
+    *            = 1/2^(15 - value) in FP32
+    *   [11:23] = stride between patches in DW = num_inputs * num_vertices * 4
+    *             max = 32*32*4 + 32*4
+    *   [24:31] = stride between vertices in DW = num_inputs * 4
+    *             max = 32*4
+    */
+   struct ac_arg vs_state_bits;
+   struct ac_arg vs_blit_inputs;
+   struct ac_arg ngg_old_thread_id; /* generated by the NGG cull shader */
+   /* HW VS */
+   struct ac_arg streamout_config;
+   struct ac_arg streamout_write_index;
+   struct ac_arg streamout_offset[4];
+
+   /* API TCS & TES */
+   /* Layout of TCS outputs in the offchip buffer
+    * # 6 bits
+    *   [0:5] = the number of patches per threadgroup, max = NUM_PATCHES (40)
+    * # 6 bits
+    *   [6:11] = the number of output vertices per patch, max = 32
+    * # 20 bits
+    *   [12:31] = the offset of per patch attributes in the buffer in bytes.
+    *             max = NUM_PATCHES*32*32*16
+    */
+   struct ac_arg tcs_offchip_layout;
+
+   /* API TCS */
+   /* Offsets where TCS outputs and TCS patch outputs live in LDS:
+    *   [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32
+    *   [16:31] = TCS output patch0 offset for per-patch / 16
+    *             max = (NUM_PATCHES + 1) * 32*32
+    */
+   struct ac_arg tcs_out_lds_offsets;
+   /* Layout of TCS outputs / TES inputs:
+    *   [0:12] = stride between output patches in DW, num_outputs * num_vertices * 4
+    *            max = 32*32*4 + 32*4
+    *   [13:18] = gl_PatchVerticesIn, max = 32
+    *   [19:31] = high 13 bits of the 32-bit address of tessellation ring buffers
+    */
+   struct ac_arg tcs_out_lds_layout;
+   struct ac_arg tcs_offchip_offset;
+   struct ac_arg tcs_factor_offset;
+
+   /* API TES */
+   struct ac_arg tes_offchip_addr;
+   struct ac_arg tes_u;
+   struct ac_arg tes_v;
+   struct ac_arg tes_rel_patch_id;
+   /* HW ES */
+   struct ac_arg es2gs_offset;
+   /* HW GS */
+   /* On gfx10:
+    *  - bits 0..11: ordered_wave_id
+    *  - bits 12..20: number of vertices in group
+    *  - bits 22..30: number of primitives in group
+    */
+   struct ac_arg gs_tg_info;
+   /* API GS */
+   struct ac_arg gs2vs_offset;
+   struct ac_arg gs_wave_id;       /* GFX6 */
+   struct ac_arg gs_vtx_offset[6]; /* in dwords (GFX6) */
+   struct ac_arg gs_vtx01_offset;  /* in dwords (GFX9) */
+   struct ac_arg gs_vtx23_offset;  /* in dwords (GFX9) */
+   struct ac_arg gs_vtx45_offset;  /* in dwords (GFX9) */
+   /* PS */
+   struct ac_arg pos_fixed_pt;
+   /* CS */
+   struct ac_arg block_size;
+   struct ac_arg cs_user_data;
+
+   struct ac_llvm_compiler *compiler;
+
+   /* Preloaded descriptors. */
+   LLVMValueRef esgs_ring;
+   LLVMValueRef gsvs_ring[4];
+   LLVMValueRef tess_offchip_ring;
+
+   LLVMValueRef invoc0_tess_factors[6]; /* outer[4], inner[2] */
+   LLVMValueRef gs_next_vertex[4];
+   LLVMValueRef gs_curprim_verts[4];
+   LLVMValueRef gs_generated_prims[4];
+   LLVMValueRef gs_ngg_emit;
+   LLVMValueRef gs_ngg_scratch;
+   LLVMValueRef postponed_kill;
+   LLVMValueRef return_value;
 };
 
-static inline struct si_shader_context *
-si_shader_context_from_abi(struct ac_shader_abi *abi)
+static inline struct si_shader_context *si_shader_context_from_abi(struct ac_shader_abi *abi)
 {
-       struct si_shader_context *ctx = NULL;
-       return container_of(abi, ctx, abi);
+   struct si_shader_context *ctx = NULL;
+   return container_of(abi, ctx, abi);
 }
 
 bool si_is_multi_part_shader(struct si_shader *shader);
 bool si_is_merged_shader(struct si_shader *shader);
-void si_add_arg_checked(struct ac_shader_args *args,
-                       enum ac_arg_regfile file,
-                       unsigned registers, enum ac_arg_type type,
-                       struct ac_arg *arg,
-                       unsigned idx);
+void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
+                        enum ac_arg_type type, struct ac_arg *arg, unsigned idx);
 unsigned si_get_max_workgroup_size(const struct si_shader *shader);
 bool si_need_ps_prolog(const union si_shader_part_key *key);
-void si_get_ps_prolog_key(struct si_shader *shader,
-                         union si_shader_part_key *key,
-                         bool separate_prolog);
-void si_get_ps_epilog_key(struct si_shader *shader,
-                         union si_shader_part_key *key);
+void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key,
+                          bool separate_prolog);
+void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key);
 void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader);
 void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader);
 
 bool gfx10_ngg_export_prim_early(struct si_shader *shader);
 void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx);
-void gfx10_ngg_build_export_prim(struct si_shader_context *ctx,
-                                LLVMValueRef user_edgeflags[3],
-                                LLVMValueRef prim_passthrough);
-void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi,
-                                              unsigned max_outputs,
-                                              LLVMValueRef *addrs);
-void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
-                            unsigned max_outputs,
-                            LLVMValueRef *addrs);
-void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx,
-                             unsigned stream,
-                             LLVMValueRef *addrs);
+void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3],
+                                 LLVMValueRef prim_passthrough);
+void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, unsigned max_outputs,
+                                               LLVMValueRef *addrs);
+void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs);
+void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs);
 void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx);
 void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx);
 void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader);
 
 /* si_shader_llvm.c */
-bool si_compile_llvm(struct si_screen *sscreen,
-                    struct si_shader_binary *binary,
-                    struct ac_shader_config *conf,
-                    struct ac_llvm_compiler *compiler,
-                    struct ac_llvm_context *ac,
-                    struct pipe_debug_callback *debug,
-                    enum pipe_shader_type shader_type,
-                    const char *name,
-                    bool less_optimized);
-void si_llvm_context_init(struct si_shader_context *ctx,
-                         struct si_screen *sscreen,
-                         struct ac_llvm_compiler *compiler,
-                         unsigned wave_size);
-void si_llvm_create_func(struct si_shader_context *ctx, const char *name,
-                        LLVMTypeRef *return_types, unsigned num_return_elems,
-                        unsigned max_workgroup_size);
+bool si_compile_llvm(struct si_screen *sscreen, struct si_shader_binary *binary,
+                     struct ac_shader_config *conf, struct ac_llvm_compiler *compiler,
+                     struct ac_llvm_context *ac, struct pipe_debug_callback *debug,
+                     enum pipe_shader_type shader_type, const char *name, bool less_optimized);
+void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscreen,
+                          struct ac_llvm_compiler *compiler, unsigned wave_size);
+void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTypeRef *return_types,
+                         unsigned num_return_elems, unsigned max_workgroup_size);
 void si_llvm_optimize_module(struct si_shader_context *ctx);
 void si_llvm_dispose(struct si_shader_context *ctx);
-LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx,
-                                 LLVMValueRef resource, LLVMValueRef offset);
+LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, LLVMValueRef resource,
+                                  LLVMValueRef offset);
 void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret);
 LLVMValueRef si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
-                                struct ac_arg param, unsigned return_index);
+                                 struct ac_arg param, unsigned return_index);
 LLVMValueRef si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
-                                      struct ac_arg param, unsigned return_index);
+                                       struct ac_arg param, unsigned return_index);
 LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret,
-                                struct ac_arg param, unsigned return_index);
+                                 struct ac_arg param, unsigned return_index);
 LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx);
-LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx,
-                                  LLVMTypeRef type, LLVMValueRef val1,
-                                  LLVMValueRef val2);
+LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, LLVMTypeRef type,
+                                   LLVMValueRef val1, LLVMValueRef val2);
 void si_llvm_emit_barrier(struct si_shader_context *ctx);
 void si_llvm_declare_esgs_ring(struct si_shader_context *ctx);
 void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param,
-                            unsigned bitoffset);
-LLVMValueRef si_unpack_param(struct si_shader_context *ctx,
-                            struct ac_arg param, unsigned rshift,
-                            unsigned bitwidth);
-LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx,
-                                unsigned swizzle);
+                             unsigned bitoffset);
+LLVMValueRef si_unpack_param(struct si_shader_context *ctx, struct ac_arg param, unsigned rshift,
+                             unsigned bitwidth);
+LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle);
 LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi);
 void si_llvm_declare_compute_memory(struct si_shader_context *ctx);
 bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir);
 void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts,
-                              unsigned num_parts, unsigned main_part,
-                              unsigned next_shader_first_part);
+                               unsigned num_parts, unsigned main_part,
+                               unsigned next_shader_first_part);
 
 /* si_shader_llvm_gs.c */
 LLVMValueRef si_is_es_thread(struct si_shader_context *ctx);
 LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx);
-void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
-                             LLVMValueRef *addrs);
+void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs);
 void si_preload_esgs_ring(struct si_shader_context *ctx);
 void si_preload_gs_rings(struct si_shader_context *ctx);
-void si_llvm_build_gs_prolog(struct si_shader_context *ctx,
-                            union si_shader_part_key *key);
+void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key);
 void si_llvm_init_gs_callbacks(struct si_shader_context *ctx);
 
 /* si_shader_llvm_tess.c */
 void si_llvm_preload_tes_rings(struct si_shader_context *ctx);
-void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
-                             LLVMValueRef *addrs);
-void si_llvm_build_tcs_epilog(struct si_shader_context *ctx,
-                             union si_shader_part_key *key);
+void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs);
+void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_part_key *key);
 void si_llvm_init_tcs_callbacks(struct si_shader_context *ctx);
 void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader);
 
 /* si_shader_llvm_ps.c */
 LLVMValueRef si_get_sample_id(struct si_shader_context *ctx);
-void si_llvm_build_ps_prolog(struct si_shader_context *ctx,
-                            union si_shader_part_key *key);
-void si_llvm_build_ps_epilog(struct si_shader_context *ctx,
-                            union si_shader_part_key *key);
-void si_llvm_build_monolithic_ps(struct si_shader_context *ctx,
-                                struct si_shader *shader);
+void si_llvm_build_ps_prolog(struct si_shader_context *ctx, union si_shader_part_key *key);
+void si_llvm_build_ps_epilog(struct si_shader_context *ctx, union si_shader_part_key *key);
+void si_llvm_build_monolithic_ps(struct si_shader_context *ctx, struct si_shader *shader);
 void si_llvm_init_ps_callbacks(struct si_shader_context *ctx);
 
 /* si_shader_llvm_resources.c */
@@ -314,21 +284,16 @@ void si_llvm_init_resource_callbacks(struct si_shader_context *ctx);
 
 /* si_shader_llvm_vs.c */
 void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir);
-void si_llvm_streamout_store_output(struct si_shader_context *ctx,
-                                   LLVMValueRef const *so_buffers,
-                                   LLVMValueRef const *so_write_offsets,
-                                   struct pipe_stream_output *stream_out,
-                                   struct si_shader_output_values *shader_out);
-void si_llvm_emit_streamout(struct si_shader_context *ctx,
-                           struct si_shader_output_values *outputs,
-                           unsigned noutput, unsigned stream);
+void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef const *so_buffers,
+                                    LLVMValueRef const *so_write_offsets,
+                                    struct pipe_stream_output *stream_out,
+                                    struct si_shader_output_values *shader_out);
+void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_output_values *outputs,
+                            unsigned noutput, unsigned stream);
 void si_llvm_build_vs_exports(struct si_shader_context *ctx,
-                             struct si_shader_output_values *outputs,
-                             unsigned noutput);
-void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
-                             LLVMValueRef *addrs);
-void si_llvm_build_vs_prolog(struct si_shader_context *ctx,
-                            union si_shader_part_key *key);
+                              struct si_shader_output_values *outputs, unsigned noutput);
+void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs);
+void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key);
 void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader);
 
 #endif
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c
index dca604afe40e35eaa94c1973f2aeaf60004f7cb3..d8bcb4ad55ce55f752b46731d13b90db5512c8d8 100644 (file)
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include "si_shader_internal.h"
-#include "si_pipe.h"
-#include "ac_rtld.h"
 #include "ac_nir_to_llvm.h"
+#include "ac_rtld.h"
+#include "si_pipe.h"
+#include "si_shader_internal.h"
 #include "sid.h"
-
 #include "tgsi/tgsi_from_mesa.h"
 #include "util/u_memory.h"
 
 struct si_llvm_diagnostics {
-       struct pipe_debug_callback *debug;
-       unsigned retval;
+   struct pipe_debug_callback *debug;
+   unsigned retval;
 };
 
 static void si_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context)
 {
-       struct si_llvm_diagnostics *diag = (struct si_llvm_diagnostics *)context;
-       LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
-       const char *severity_str = NULL;
-
-       switch (severity) {
-       case LLVMDSError:
-               severity_str = "error";
-               break;
-       case LLVMDSWarning:
-               severity_str = "warning";
-               break;
-       case LLVMDSRemark:
-       case LLVMDSNote:
-       default:
-               return;
-       }
-
-       char *description = LLVMGetDiagInfoDescription(di);
-
-       pipe_debug_message(diag->debug, SHADER_INFO,
-                          "LLVM diagnostic (%s): %s", severity_str, description);
-
-       if (severity == LLVMDSError) {
-               diag->retval = 1;
-               fprintf(stderr,"LLVM triggered Diagnostic Handler: %s\n", description);
-       }
-
-       LLVMDisposeMessage(description);
+   struct si_llvm_diagnostics *diag = (struct si_llvm_diagnostics *)context;
+   LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
+   const char *severity_str = NULL;
+
+   switch (severity) {
+   case LLVMDSError:
+      severity_str = "error";
+      break;
+   case LLVMDSWarning:
+      severity_str = "warning";
+      break;
+   case LLVMDSRemark:
+   case LLVMDSNote:
+   default:
+      return;
+   }
+
+   char *description = LLVMGetDiagInfoDescription(di);
+
+   pipe_debug_message(diag->debug, SHADER_INFO, "LLVM diagnostic (%s): %s", severity_str,
+                      description);
+
+   if (severity == LLVMDSError) {
+      diag->retval = 1;
+      fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n", description);
+   }
+
+   LLVMDisposeMessage(description);
 }
 
-bool si_compile_llvm(struct si_screen *sscreen,
-                    struct si_shader_binary *binary,
-                    struct ac_shader_config *conf,
-                    struct ac_llvm_compiler *compiler,
-                    struct ac_llvm_context *ac,
-                    struct pipe_debug_callback *debug,
-                    enum pipe_shader_type shader_type,
-                    const char *name,
-                    bool less_optimized)
+bool si_compile_llvm(struct si_screen *sscreen, struct si_shader_binary *binary,
+                     struct ac_shader_config *conf, struct ac_llvm_compiler *compiler,
+                     struct ac_llvm_context *ac, struct pipe_debug_callback *debug,
+                     enum pipe_shader_type shader_type, const char *name, bool less_optimized)
 {
-       unsigned count = p_atomic_inc_return(&sscreen->num_compilations);
-
-       if (si_can_dump_shader(sscreen, shader_type)) {
-               fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
-
-               if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) {
-                       fprintf(stderr, "%s LLVM IR:\n\n", name);
-                       ac_dump_module(ac->module);
-                       fprintf(stderr, "\n");
-               }
-       }
-
-       if (sscreen->record_llvm_ir) {
-               char *ir = LLVMPrintModuleToString(ac->module);
-               binary->llvm_ir_string = strdup(ir);
-               LLVMDisposeMessage(ir);
-       }
-
-       if (!si_replace_shader(count, binary)) {
-               struct ac_compiler_passes *passes = compiler->passes;
-
-               if (ac->wave_size == 32)
-                       passes = compiler->passes_wave32;
-               else if (less_optimized && compiler->low_opt_passes)
-                       passes = compiler->low_opt_passes;
-
-               struct si_llvm_diagnostics diag = {debug};
-               LLVMContextSetDiagnosticHandler(ac->context, si_diagnostic_handler, &diag);
-
-               if (!ac_compile_module_to_elf(passes, ac->module,
-                                             (char **)&binary->elf_buffer,
-                                             &binary->elf_size))
-                       diag.retval = 1;
-
-               if (diag.retval != 0) {
-                       pipe_debug_message(debug, SHADER_INFO, "LLVM compilation failed");
-                       return false;
-               }
-       }
-
-       struct ac_rtld_binary rtld;
-       if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){
-                       .info = &sscreen->info,
-                       .shader_type = tgsi_processor_to_shader_stage(shader_type),
-                       .wave_size = ac->wave_size,
-                       .num_parts = 1,
-                       .elf_ptrs = &binary->elf_buffer,
-                       .elf_sizes = &binary->elf_size }))
-               return false;
-
-       bool ok = ac_rtld_read_config(&rtld, conf);
-       ac_rtld_close(&rtld);
-       return ok;
+   unsigned count = p_atomic_inc_return(&sscreen->num_compilations);
+
+   if (si_can_dump_shader(sscreen, shader_type)) {
+      fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
+
+      if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) {
+         fprintf(stderr, "%s LLVM IR:\n\n", name);
+         ac_dump_module(ac->module);
+         fprintf(stderr, "\n");
+      }
+   }
+
+   if (sscreen->record_llvm_ir) {
+      char *ir = LLVMPrintModuleToString(ac->module);
+      binary->llvm_ir_string = strdup(ir);
+      LLVMDisposeMessage(ir);
+   }
+
+   if (!si_replace_shader(count, binary)) {
+      struct ac_compiler_passes *passes = compiler->passes;
+
+      if (ac->wave_size == 32)
+         passes = compiler->passes_wave32;
+      else if (less_optimized && compiler->low_opt_passes)
+         passes = compiler->low_opt_passes;
+
+      struct si_llvm_diagnostics diag = {debug};
+      LLVMContextSetDiagnosticHandler(ac->context, si_diagnostic_handler, &diag);
+
+      if (!ac_compile_module_to_elf(passes, ac->module, (char **)&binary->elf_buffer,
+                                    &binary->elf_size))
+         diag.retval = 1;
+
+      if (diag.retval != 0) {
+         pipe_debug_message(debug, SHADER_INFO, "LLVM compilation failed");
+         return false;
+      }
+   }
+
+   struct ac_rtld_binary rtld;
+   if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){
+                               .info = &sscreen->info,
+                               .shader_type = tgsi_processor_to_shader_stage(shader_type),
+                               .wave_size = ac->wave_size,
+                               .num_parts = 1,
+                               .elf_ptrs = &binary->elf_buffer,
+                               .elf_sizes = &binary->elf_size}))
+      return false;
+
+   bool ok = ac_rtld_read_config(&rtld, conf);
+   ac_rtld_close(&rtld);
+   return ok;
 }
 
-void si_llvm_context_init(struct si_shader_context *ctx,
-                         struct si_screen *sscreen,
-                         struct ac_llvm_compiler *compiler,
-                         unsigned wave_size)
+void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscreen,
+                          struct ac_llvm_compiler *compiler, unsigned wave_size)
 {
-       memset(ctx, 0, sizeof(*ctx));
-       ctx->screen = sscreen;
-       ctx->compiler = compiler;
-
-       ac_llvm_context_init(&ctx->ac, compiler, sscreen->info.chip_class,
-                            sscreen->info.family,
-                            AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH,
-                            wave_size, 64);
+   memset(ctx, 0, sizeof(*ctx));
+   ctx->screen = sscreen;
+   ctx->compiler = compiler;
+
+   ac_llvm_context_init(&ctx->ac, compiler, sscreen->info.chip_class, sscreen->info.family,
+                        AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH, wave_size, 64);
 }
 
-void si_llvm_create_func(struct si_shader_context *ctx, const char *name,
-                        LLVMTypeRef *return_types, unsigned num_return_elems,
-                        unsigned max_workgroup_size)
+void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTypeRef *return_types,
+                         unsigned num_return_elems, unsigned max_workgroup_size)
 {
-       LLVMTypeRef ret_type;
-       enum ac_llvm_calling_convention call_conv;
-       enum pipe_shader_type real_shader_type;
-
-       if (num_return_elems)
-               ret_type = LLVMStructTypeInContext(ctx->ac.context,
-                                                  return_types,
-                                                  num_return_elems, true);
-       else
-               ret_type = ctx->ac.voidt;
-
-       real_shader_type = ctx->type;
-
-       /* LS is merged into HS (TCS), and ES is merged into GS. */
-       if (ctx->screen->info.chip_class >= GFX9) {
-               if (ctx->shader->key.as_ls)
-                       real_shader_type = PIPE_SHADER_TESS_CTRL;
-               else if (ctx->shader->key.as_es || ctx->shader->key.as_ngg)
-                       real_shader_type = PIPE_SHADER_GEOMETRY;
-       }
-
-       switch (real_shader_type) {
-       case PIPE_SHADER_VERTEX:
-       case PIPE_SHADER_TESS_EVAL:
-               call_conv = AC_LLVM_AMDGPU_VS;
-               break;
-       case PIPE_SHADER_TESS_CTRL:
-               call_conv = AC_LLVM_AMDGPU_HS;
-               break;
-       case PIPE_SHADER_GEOMETRY:
-               call_conv = AC_LLVM_AMDGPU_GS;
-               break;
-       case PIPE_SHADER_FRAGMENT:
-               call_conv = AC_LLVM_AMDGPU_PS;
-               break;
-       case PIPE_SHADER_COMPUTE:
-               call_conv = AC_LLVM_AMDGPU_CS;
-               break;
-       default:
-               unreachable("Unhandle shader type");
-       }
-
-       /* Setup the function */
-       ctx->return_type = ret_type;
-       ctx->main_fn = ac_build_main(&ctx->args, &ctx->ac, call_conv, name,
-                                    ret_type, ctx->ac.module);
-       ctx->return_value = LLVMGetUndef(ctx->return_type);
-
-       if (ctx->screen->info.address32_hi) {
-               ac_llvm_add_target_dep_function_attr(ctx->main_fn,
-                                                    "amdgpu-32bit-address-high-bits",
-                                                    ctx->screen->info.address32_hi);
-       }
-
-       LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
-                                          "no-signed-zeros-fp-math",
-                                          "true");
-
-       ac_llvm_set_workgroup_size(ctx->main_fn, max_workgroup_size);
+   LLVMTypeRef ret_type;
+   enum ac_llvm_calling_convention call_conv;
+   enum pipe_shader_type real_shader_type;
+
+   if (num_return_elems)
+      ret_type = LLVMStructTypeInContext(ctx->ac.context, return_types, num_return_elems, true);
+   else
+      ret_type = ctx->ac.voidt;
+
+   real_shader_type = ctx->type;
+
+   /* LS is merged into HS (TCS), and ES is merged into GS. */
+   if (ctx->screen->info.chip_class >= GFX9) {
+      if (ctx->shader->key.as_ls)
+         real_shader_type = PIPE_SHADER_TESS_CTRL;
+      else if (ctx->shader->key.as_es || ctx->shader->key.as_ngg)
+         real_shader_type = PIPE_SHADER_GEOMETRY;
+   }
+
+   switch (real_shader_type) {
+   case PIPE_SHADER_VERTEX:
+   case PIPE_SHADER_TESS_EVAL:
+      call_conv = AC_LLVM_AMDGPU_VS;
+      break;
+   case PIPE_SHADER_TESS_CTRL:
+      call_conv = AC_LLVM_AMDGPU_HS;
+      break;
+   case PIPE_SHADER_GEOMETRY:
+      call_conv = AC_LLVM_AMDGPU_GS;
+      break;
+   case PIPE_SHADER_FRAGMENT:
+      call_conv = AC_LLVM_AMDGPU_PS;
+      break;
+   case PIPE_SHADER_COMPUTE:
+      call_conv = AC_LLVM_AMDGPU_CS;
+      break;
+   default:
+      unreachable("Unhandle shader type");
+   }
+
+   /* Setup the function */
+   ctx->return_type = ret_type;
+   ctx->main_fn = ac_build_main(&ctx->args, &ctx->ac, call_conv, name, ret_type, ctx->ac.module);
+   ctx->return_value = LLVMGetUndef(ctx->return_type);
+
+   if (ctx->screen->info.address32_hi) {
+      ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-32bit-address-high-bits",
+                                           ctx->screen->info.address32_hi);
+   }
+
+   LLVMAddTargetDependentFunctionAttr(ctx->main_fn, "no-signed-zeros-fp-math", "true");
+
+   ac_llvm_set_workgroup_size(ctx->main_fn, max_workgroup_size);
 }
 
 void si_llvm_optimize_module(struct si_shader_context *ctx)
 {
-       /* Dump LLVM IR before any optimization passes */
-       if (ctx->screen->debug_flags & DBG(PREOPT_IR) &&
-           si_can_dump_shader(ctx->screen, ctx->type))
-               LLVMDumpModule(ctx->ac.module);
-
-       /* Run the pass */
-       LLVMRunPassManager(ctx->compiler->passmgr, ctx->ac.module);
-       LLVMDisposeBuilder(ctx->ac.builder);
+   /* Dump LLVM IR before any optimization passes */
+   if (ctx->screen->debug_flags & DBG(PREOPT_IR) && si_can_dump_shader(ctx->screen, ctx->type))
+      LLVMDumpModule(ctx->ac.module);
+
+   /* Run the pass */
+   LLVMRunPassManager(ctx->compiler->passmgr, ctx->ac.module);
+   LLVMDisposeBuilder(ctx->ac.builder);
 }
 
 void si_llvm_dispose(struct si_shader_context *ctx)
 {
-       LLVMDisposeModule(ctx->ac.module);
-       LLVMContextDispose(ctx->ac.context);
-       ac_llvm_context_dispose(&ctx->ac);
+   LLVMDisposeModule(ctx->ac.module);
+   LLVMContextDispose(ctx->ac.context);
+   ac_llvm_context_dispose(&ctx->ac);
 }
 
 /**
  * Load a dword from a constant buffer.
  */
-LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx,
-                                 LLVMValueRef resource, LLVMValueRef offset)
+LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, LLVMValueRef resource,
+                                  LLVMValueRef offset)
 {
-       return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL,
-                                   0, 0, true, true);
+   return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL, 0, 0, true, true);
 }
 
 void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
 {
-       if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
-               LLVMBuildRetVoid(ctx->ac.builder);
-       else
-               LLVMBuildRet(ctx->ac.builder, ret);
+   if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
+      LLVMBuildRetVoid(ctx->ac.builder);
+   else
+      LLVMBuildRet(ctx->ac.builder, ret);
 }
 
 LLVMValueRef si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
-                                struct ac_arg param, unsigned return_index)
+                                 struct ac_arg param, unsigned return_index)
 {
-       return LLVMBuildInsertValue(ctx->ac.builder, ret,
-                                   ac_get_arg(&ctx->ac, param),
-                                   return_index, "");
+   return LLVMBuildInsertValue(ctx->ac.builder, ret, ac_get_arg(&ctx->ac, param), return_index, "");
 }
 
 LLVMValueRef si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
-                                      struct ac_arg param, unsigned return_index)
+                                       struct ac_arg param, unsigned return_index)
 {
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef p = ac_get_arg(&ctx->ac, param);
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMValueRef p = ac_get_arg(&ctx->ac, param);
 
-       return LLVMBuildInsertValue(builder, ret,
-                                   ac_to_float(&ctx->ac, p),
-                                   return_index, "");
+   return LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, p), return_index, "");
 }
 
 LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret,
-                                struct ac_arg param, unsigned return_index)
+                                 struct ac_arg param, unsigned return_index)
 {
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef ptr = ac_get_arg(&ctx->ac, param);
-       ptr = LLVMBuildPtrToInt(builder, ptr, ctx->ac.i32, "");
-       return LLVMBuildInsertValue(builder, ret, ptr, return_index, "");
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMValueRef ptr = ac_get_arg(&ctx->ac, param);
+   ptr = LLVMBuildPtrToInt(builder, ptr, ctx->ac.i32, "");
+   return LLVMBuildInsertValue(builder, ret, ptr, return_index, "");
 }
 
 LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
 {
-       LLVMValueRef ptr[2], list;
-       bool merged_shader = si_is_merged_shader(ctx->shader);
+   LLVMValueRef ptr[2], list;
+   bool merged_shader = si_is_merged_shader(ctx->shader);
 
-       ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS);
-       list = LLVMBuildIntToPtr(ctx->ac.builder, ptr[0],
-                                ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
-       return list;
+   ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS);
+   list =
+      LLVMBuildIntToPtr(ctx->ac.builder, ptr[0], ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
+   return list;
 }
 
-LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx,
-                                  LLVMTypeRef type, LLVMValueRef val1,
-                                  LLVMValueRef val2)
+LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, LLVMTypeRef type,
+                                   LLVMValueRef val1, LLVMValueRef val2)
 {
-       LLVMValueRef values[2] = {
-               ac_to_integer(&ctx->ac, val1),
-               ac_to_integer(&ctx->ac, val2),
-       };
-       LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, 2);
-       return LLVMBuildBitCast(ctx->ac.builder, result, type, "");
+   LLVMValueRef values[2] = {
+      ac_to_integer(&ctx->ac, val1),
+      ac_to_integer(&ctx->ac, val2),
+   };
+   LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, 2);
+   return LLVMBuildBitCast(ctx->ac.builder, result, type, "");
 }
 
 void si_llvm_emit_barrier(struct si_shader_context *ctx)
 {
-       /* GFX6 only (thanks to a hw bug workaround):
-        * The real barrier instruction isn’t needed, because an entire patch
-        * always fits into a single wave.
-        */
-       if (ctx->screen->info.chip_class == GFX6 &&
-           ctx->type == PIPE_SHADER_TESS_CTRL) {
-               ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE);
-               return;
-       }
-
-       ac_build_s_barrier(&ctx->ac);
+   /* GFX6 only (thanks to a hw bug workaround):
+    * The real barrier instruction isn’t needed, because an entire patch
+    * always fits into a single wave.
+    */
+   if (ctx->screen->info.chip_class == GFX6 && ctx->type == PIPE_SHADER_TESS_CTRL) {
+      ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE);
+      return;
+   }
+
+   ac_build_s_barrier(&ctx->ac);
 }
 
 /* Ensure that the esgs ring is declared.
@@ -323,187 +297,169 @@ void si_llvm_emit_barrier(struct si_shader_context *ctx)
  */
 void si_llvm_declare_esgs_ring(struct si_shader_context *ctx)
 {
-       if (ctx->esgs_ring)
-               return;
+   if (ctx->esgs_ring)
+      return;
 
-       assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring"));
+   assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring"));
 
-       ctx->esgs_ring = LLVMAddGlobalInAddressSpace(
-               ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
-               "esgs_ring",
-               AC_ADDR_SPACE_LDS);
-       LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage);
-       LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
+   ctx->esgs_ring = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
+                                                "esgs_ring", AC_ADDR_SPACE_LDS);
+   LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage);
+   LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
 }
 
-void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param,
-                            unsigned bitoffset)
+void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param, unsigned bitoffset)
 {
-       LLVMValueRef args[] = {
-               ac_get_arg(&ctx->ac, param),
-               LLVMConstInt(ctx->ac.i32, bitoffset, 0),
-       };
-       ac_build_intrinsic(&ctx->ac,
-                          "llvm.amdgcn.init.exec.from.input",
-                          ctx->ac.voidt, args, 2, AC_FUNC_ATTR_CONVERGENT);
+   LLVMValueRef args[] = {
+      ac_get_arg(&ctx->ac, param),
+      LLVMConstInt(ctx->ac.i32, bitoffset, 0),
+   };
+   ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.init.exec.from.input", ctx->ac.voidt, args, 2,
+                      AC_FUNC_ATTR_CONVERGENT);
 }
 
 /**
  * Get the value of a shader input parameter and extract a bitfield.
  */
-static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx,
-                                     LLVMValueRef value, unsigned rshift,
-                                     unsigned bitwidth)
+static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx, LLVMValueRef value,
+                                      unsigned rshift, unsigned bitwidth)
 {
-       if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
-               value = ac_to_integer(&ctx->ac, value);
+   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
+      value = ac_to_integer(&ctx->ac, value);
 
-       if (rshift)
-               value = LLVMBuildLShr(ctx->ac.builder, value,
-                                     LLVMConstInt(ctx->ac.i32, rshift, 0), "");
+   if (rshift)
+      value = LLVMBuildLShr(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, rshift, 0), "");
 
-       if (rshift + bitwidth < 32) {
-               unsigned mask = (1 << bitwidth) - 1;
-               value = LLVMBuildAnd(ctx->ac.builder, value,
-                                    LLVMConstInt(ctx->ac.i32, mask, 0), "");
-       }
+   if (rshift + bitwidth < 32) {
+      unsigned mask = (1 << bitwidth) - 1;
+      value = LLVMBuildAnd(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, mask, 0), "");
+   }
 
-       return value;
+   return value;
 }
 
-LLVMValueRef si_unpack_param(struct si_shader_context *ctx,
-                            struct ac_arg param, unsigned rshift,
-                            unsigned bitwidth)
+LLVMValueRef si_unpack_param(struct si_shader_context *ctx, struct ac_arg param, unsigned rshift,
+                             unsigned bitwidth)
 {
-       LLVMValueRef value = ac_get_arg(&ctx->ac, param);
+   LLVMValueRef value = ac_get_arg(&ctx->ac, param);
 
-       return unpack_llvm_param(ctx, value, rshift, bitwidth);
+   return unpack_llvm_param(ctx, value, rshift, bitwidth);
 }
 
-LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx,
-                                unsigned swizzle)
+LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle)
 {
-       if (swizzle > 0)
-               return ctx->ac.i32_0;
-
-       switch (ctx->type) {
-       case PIPE_SHADER_VERTEX:
-               return ac_get_arg(&ctx->ac, ctx->vs_prim_id);
-       case PIPE_SHADER_TESS_CTRL:
-               return ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id);
-       case PIPE_SHADER_TESS_EVAL:
-               return ac_get_arg(&ctx->ac, ctx->args.tes_patch_id);
-       case PIPE_SHADER_GEOMETRY:
-               return ac_get_arg(&ctx->ac, ctx->args.gs_prim_id);
-       default:
-               assert(0);
-               return ctx->ac.i32_0;
-       }
+   if (swizzle > 0)
+      return ctx->ac.i32_0;
+
+   switch (ctx->type) {
+   case PIPE_SHADER_VERTEX:
+      return ac_get_arg(&ctx->ac, ctx->vs_prim_id);
+   case PIPE_SHADER_TESS_CTRL:
+      return ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id);
+   case PIPE_SHADER_TESS_EVAL:
+      return ac_get_arg(&ctx->ac, ctx->args.tes_patch_id);
+   case PIPE_SHADER_GEOMETRY:
+      return ac_get_arg(&ctx->ac, ctx->args.gs_prim_id);
+   default:
+      assert(0);
+      return ctx->ac.i32_0;
+   }
 }
 
 LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
 
-       LLVMValueRef values[3];
-       LLVMValueRef result;
-       unsigned i;
-       unsigned *properties = ctx->shader->selector->info.properties;
+   LLVMValueRef values[3];
+   LLVMValueRef result;
+   unsigned i;
+   unsigned *properties = ctx->shader->selector->info.properties;
 
-       if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
-               unsigned sizes[3] = {
-                       properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
-                       properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
-                       properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
-               };
+   if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
+      unsigned sizes[3] = {properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
+                           properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
+                           properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]};
 
-               for (i = 0; i < 3; ++i)
-                       values[i] = LLVMConstInt(ctx->ac.i32, sizes[i], 0);
+      for (i = 0; i < 3; ++i)
+         values[i] = LLVMConstInt(ctx->ac.i32, sizes[i], 0);
 
-               result = ac_build_gather_values(&ctx->ac, values, 3);
-       } else {
-               result = ac_get_arg(&ctx->ac, ctx->block_size);
-       }
+      result = ac_build_gather_values(&ctx->ac, values, 3);
+   } else {
+      result = ac_get_arg(&ctx->ac, ctx->block_size);
+   }
 
-       return result;
+   return result;
 }
 
 void si_llvm_declare_compute_memory(struct si_shader_context *ctx)
 {
-       struct si_shader_selector *sel = ctx->shader->selector;
-       unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE];
+   struct si_shader_selector *sel = ctx->shader->selector;
+   unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE];
 
-       LLVMTypeRef i8p = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS);
-       LLVMValueRef var;
+   LLVMTypeRef i8p = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS);
+   LLVMValueRef var;
 
-       assert(!ctx->ac.lds);
+   assert(!ctx->ac.lds);
 
-       var = LLVMAddGlobalInAddressSpace(ctx->ac.module,
-                                         LLVMArrayType(ctx->ac.i8, lds_size),
-                                         "compute_lds",
-                                         AC_ADDR_SPACE_LDS);
-       LLVMSetAlignment(var, 64 * 1024);
+   var = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i8, lds_size),
+                                     "compute_lds", AC_ADDR_SPACE_LDS);
+   LLVMSetAlignment(var, 64 * 1024);
 
-       ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, "");
+   ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, "");
 }
 
 bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir)
 {
-       if (nir->info.stage == MESA_SHADER_VERTEX) {
-               si_llvm_load_vs_inputs(ctx, nir);
-       } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
-                unsigned colors_read =
-                        ctx->shader->selector->info.colors_read;
-                LLVMValueRef main_fn = ctx->main_fn;
-
-                LLVMValueRef undef = LLVMGetUndef(ctx->ac.f32);
-
-                unsigned offset = SI_PARAM_POS_FIXED_PT + 1;
-
-                if (colors_read & 0x0f) {
-                        unsigned mask = colors_read & 0x0f;
-                        LLVMValueRef values[4];
-                        values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef;
-                        values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef;
-                        values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef;
-                        values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef;
-                        ctx->abi.color0 =
-                                ac_to_integer(&ctx->ac,
-                                              ac_build_gather_values(&ctx->ac, values, 4));
-                }
-                if (colors_read & 0xf0) {
-                        unsigned mask = (colors_read & 0xf0) >> 4;
-                        LLVMValueRef values[4];
-                        values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef;
-                        values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef;
-                        values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef;
-                        values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef;
-                        ctx->abi.color1 =
-                                ac_to_integer(&ctx->ac,
-                                              ac_build_gather_values(&ctx->ac, values, 4));
-                }
-
-               ctx->abi.interp_at_sample_force_center =
-                       ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center;
-       } else if (nir->info.stage == MESA_SHADER_COMPUTE) {
-               if (nir->info.cs.user_data_components_amd) {
-                       ctx->abi.user_data = ac_get_arg(&ctx->ac, ctx->cs_user_data);
-                       ctx->abi.user_data = ac_build_expand_to_vec4(&ctx->ac, ctx->abi.user_data,
-                                                                    nir->info.cs.user_data_components_amd);
-               }
-       }
-
-       ctx->abi.inputs = &ctx->inputs[0];
-       ctx->abi.clamp_shadow_reference = true;
-       ctx->abi.robust_buffer_access = true;
-
-       if (ctx->shader->selector->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]) {
-               assert(gl_shader_stage_is_compute(nir->info.stage));
-               si_llvm_declare_compute_memory(ctx);
-       }
-       ac_nir_translate(&ctx->ac, &ctx->abi, &ctx->args, nir);
-
-       return true;
+   if (nir->info.stage == MESA_SHADER_VERTEX) {
+      si_llvm_load_vs_inputs(ctx, nir);
+   } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
+      unsigned colors_read = ctx->shader->selector->info.colors_read;
+      LLVMValueRef main_fn = ctx->main_fn;
+
+      LLVMValueRef undef = LLVMGetUndef(ctx->ac.f32);
+
+      unsigned offset = SI_PARAM_POS_FIXED_PT + 1;
+
+      if (colors_read & 0x0f) {
+         unsigned mask = colors_read & 0x0f;
+         LLVMValueRef values[4];
+         values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef;
+         values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef;
+         values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef;
+         values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef;
+         ctx->abi.color0 = ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, 4));
+      }
+      if (colors_read & 0xf0) {
+         unsigned mask = (colors_read & 0xf0) >> 4;
+         LLVMValueRef values[4];
+         values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef;
+         values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef;
+         values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef;
+         values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef;
+         ctx->abi.color1 = ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, 4));
+      }
+
+      ctx->abi.interp_at_sample_force_center =
+         ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center;
+   } else if (nir->info.stage == MESA_SHADER_COMPUTE) {
+      if (nir->info.cs.user_data_components_amd) {
+         ctx->abi.user_data = ac_get_arg(&ctx->ac, ctx->cs_user_data);
+         ctx->abi.user_data = ac_build_expand_to_vec4(&ctx->ac, ctx->abi.user_data,
+                                                      nir->info.cs.user_data_components_amd);
+      }
+   }
+
+   ctx->abi.inputs = &ctx->inputs[0];
+   ctx->abi.clamp_shadow_reference = true;
+   ctx->abi.robust_buffer_access = true;
+
+   if (ctx->shader->selector->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]) {
+      assert(gl_shader_stage_is_compute(nir->info.stage));
+      si_llvm_declare_compute_memory(ctx);
+   }
+   ac_nir_translate(&ctx->ac, &ctx->abi, &ctx->args, nir);
+
+   return true;
 }
 
 /**
@@ -511,278 +467,270 @@ bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir)
  * runs them in sequence to form a monolithic shader.
  */
 void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts,
-                              unsigned num_parts, unsigned main_part,
-                              unsigned next_shader_first_part)
+                               unsigned num_parts, unsigned main_part,
+                               unsigned next_shader_first_part)
 {
-       LLVMBuilderRef builder = ctx->ac.builder;
-       /* PS epilog has one arg per color component; gfx9 merged shader
-        * prologs need to forward 40 SGPRs.
-        */
-       LLVMValueRef initial[AC_MAX_ARGS], out[AC_MAX_ARGS];
-       LLVMTypeRef function_type;
-       unsigned num_first_params;
-       unsigned num_out, initial_num_out;
-       ASSERTED unsigned num_out_sgpr; /* used in debug checks */
-       ASSERTED unsigned initial_num_out_sgpr; /* used in debug checks */
-       unsigned num_sgprs, num_vgprs;
-       unsigned gprs;
-
-       memset(&ctx->args, 0, sizeof(ctx->args));
-
-       for (unsigned i = 0; i < num_parts; ++i) {
-               ac_add_function_attr(ctx->ac.context, parts[i], -1,
-                                    AC_FUNC_ATTR_ALWAYSINLINE);
-               LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
-       }
-
-       /* The parameters of the wrapper function correspond to those of the
-        * first part in terms of SGPRs and VGPRs, but we use the types of the
-        * main part to get the right types. This is relevant for the
-        * dereferenceable attribute on descriptor table pointers.
-        */
-       num_sgprs = 0;
-       num_vgprs = 0;
-
-       function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
-       num_first_params = LLVMCountParamTypes(function_type);
-
-       for (unsigned i = 0; i < num_first_params; ++i) {
-               LLVMValueRef param = LLVMGetParam(parts[0], i);
-
-               if (ac_is_sgpr_param(param)) {
-                       assert(num_vgprs == 0);
-                       num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
-               } else {
-                       num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
-               }
-       }
-
-       gprs = 0;
-       while (gprs < num_sgprs + num_vgprs) {
-               LLVMValueRef param = LLVMGetParam(parts[main_part], ctx->args.arg_count);
-               LLVMTypeRef type = LLVMTypeOf(param);
-               unsigned size = ac_get_type_size(type) / 4;
-
-               /* This is going to get casted anyways, so we don't have to
-                * have the exact same type. But we do have to preserve the
-                * pointer-ness so that LLVM knows about it.
-                */
-               enum ac_arg_type arg_type = AC_ARG_INT;
-               if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
-                       type = LLVMGetElementType(type);
-
-                       if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
-                               if (LLVMGetVectorSize(type) == 4)
-                                       arg_type = AC_ARG_CONST_DESC_PTR;
-                               else if (LLVMGetVectorSize(type) == 8)
-                                       arg_type = AC_ARG_CONST_IMAGE_PTR;
-                               else
-                                       assert(0);
-                       } else if (type == ctx->ac.f32) {
-                               arg_type = AC_ARG_CONST_FLOAT_PTR;
-                       } else {
-                               assert(0);
-                       }
-               }
-
-               ac_add_arg(&ctx->args, gprs < num_sgprs ? AC_ARG_SGPR : AC_ARG_VGPR,
-                          size, arg_type, NULL);
-
-               assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
-               assert(gprs + size <= num_sgprs + num_vgprs &&
-                      (gprs >= num_sgprs || gprs + size <= num_sgprs));
-
-               gprs += size;
-       }
-
-       /* Prepare the return type. */
-       unsigned num_returns = 0;
-       LLVMTypeRef returns[AC_MAX_ARGS], last_func_type, return_type;
-
-       last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1]));
-       return_type = LLVMGetReturnType(last_func_type);
-
-       switch (LLVMGetTypeKind(return_type)) {
-       case LLVMStructTypeKind:
-               num_returns = LLVMCountStructElementTypes(return_type);
-               assert(num_returns <= ARRAY_SIZE(returns));
-               LLVMGetStructElementTypes(return_type, returns);
-               break;
-       case LLVMVoidTypeKind:
-               break;
-       default:
-               unreachable("unexpected type");
-       }
-
-       si_llvm_create_func(ctx, "wrapper", returns, num_returns,
-                           si_get_max_workgroup_size(ctx->shader));
-
-       if (si_is_merged_shader(ctx->shader))
-               ac_init_exec_full_mask(&ctx->ac);
-
-       /* Record the arguments of the function as if they were an output of
-        * a previous part.
-        */
-       num_out = 0;
-       num_out_sgpr = 0;
-
-       for (unsigned i = 0; i < ctx->args.arg_count; ++i) {
-               LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
-               LLVMTypeRef param_type = LLVMTypeOf(param);
-               LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? ctx->ac.i32 : ctx->ac.f32;
-               unsigned size = ac_get_type_size(param_type) / 4;
-
-               if (size == 1) {
-                       if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
-                               param = LLVMBuildPtrToInt(builder, param, ctx->ac.i32, "");
-                               param_type = ctx->ac.i32;
-                       }
-
-                       if (param_type != out_type)
-                               param = LLVMBuildBitCast(builder, param, out_type, "");
-                       out[num_out++] = param;
-               } else {
-                       LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
-
-                       if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
-                               param = LLVMBuildPtrToInt(builder, param, ctx->ac.i64, "");
-                               param_type = ctx->ac.i64;
-                       }
-
-                       if (param_type != vector_type)
-                               param = LLVMBuildBitCast(builder, param, vector_type, "");
-
-                       for (unsigned j = 0; j < size; ++j)
-                               out[num_out++] = LLVMBuildExtractElement(
-                                       builder, param, LLVMConstInt(ctx->ac.i32, j, 0), "");
-               }
-
-               if (ctx->args.args[i].file == AC_ARG_SGPR)
-                       num_out_sgpr = num_out;
-       }
-
-       memcpy(initial, out, sizeof(out));
-       initial_num_out = num_out;
-       initial_num_out_sgpr = num_out_sgpr;
-
-       /* Now chain the parts. */
-       LLVMValueRef ret = NULL;
-       for (unsigned part = 0; part < num_parts; ++part) {
-               LLVMValueRef in[AC_MAX_ARGS];
-               LLVMTypeRef ret_type;
-               unsigned out_idx = 0;
-               unsigned num_params = LLVMCountParams(parts[part]);
-
-               /* Merged shaders are executed conditionally depending
-                * on the number of enabled threads passed in the input SGPRs. */
-               if (si_is_multi_part_shader(ctx->shader) && part == 0) {
-                       LLVMValueRef ena, count = initial[3];
-
-                       count = LLVMBuildAnd(builder, count,
-                                            LLVMConstInt(ctx->ac.i32, 0x7f, 0), "");
-                       ena = LLVMBuildICmp(builder, LLVMIntULT,
-                                           ac_get_thread_id(&ctx->ac), count, "");
-                       ac_build_ifcc(&ctx->ac, ena, 6506);
-               }
-
-               /* Derive arguments for the next part from outputs of the
-                * previous one.
-                */
-               for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
-                       LLVMValueRef param;
-                       LLVMTypeRef param_type;
-                       bool is_sgpr;
-                       unsigned param_size;
-                       LLVMValueRef arg = NULL;
-
-                       param = LLVMGetParam(parts[part], param_idx);
-                       param_type = LLVMTypeOf(param);
-                       param_size = ac_get_type_size(param_type) / 4;
-                       is_sgpr = ac_is_sgpr_param(param);
-
-                       if (is_sgpr) {
-                               ac_add_function_attr(ctx->ac.context, parts[part],
-                                                    param_idx + 1, AC_FUNC_ATTR_INREG);
-                       } else if (out_idx < num_out_sgpr) {
-                               /* Skip returned SGPRs the current part doesn't
-                                * declare on the input. */
-                               out_idx = num_out_sgpr;
-                       }
-
-                       assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
-
-                       if (param_size == 1)
-                               arg = out[out_idx];
-                       else
-                               arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size);
-
-                       if (LLVMTypeOf(arg) != param_type) {
-                               if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
-                                       if (LLVMGetPointerAddressSpace(param_type) ==
-                                           AC_ADDR_SPACE_CONST_32BIT) {
-                                               arg = LLVMBuildBitCast(builder, arg, ctx->ac.i32, "");
-                                               arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
-                                       } else {
-                                               arg = LLVMBuildBitCast(builder, arg, ctx->ac.i64, "");
-                                               arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
-                                       }
-                               } else {
-                                       arg = LLVMBuildBitCast(builder, arg, param_type, "");
-                               }
-                       }
-
-                       in[param_idx] = arg;
-                       out_idx += param_size;
-               }
-
-               ret = ac_build_call(&ctx->ac, parts[part], in, num_params);
-
-               if (si_is_multi_part_shader(ctx->shader) &&
-                   part + 1 == next_shader_first_part) {
-                       ac_build_endif(&ctx->ac, 6506);
-
-                       /* The second half of the merged shader should use
-                        * the inputs from the toplevel (wrapper) function,
-                        * not the return value from the last call.
-                        *
-                        * That's because the last call was executed condi-
-                        * tionally, so we can't consume it in the main
-                        * block.
-                        */
-                       memcpy(out, initial, sizeof(initial));
-                       num_out = initial_num_out;
-                       num_out_sgpr = initial_num_out_sgpr;
-                       continue;
-               }
-
-               /* Extract the returned GPRs. */
-               ret_type = LLVMTypeOf(ret);
-               num_out = 0;
-               num_out_sgpr = 0;
-
-               if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
-                       assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
-
-                       unsigned ret_size = LLVMCountStructElementTypes(ret_type);
-
-                       for (unsigned i = 0; i < ret_size; ++i) {
-                               LLVMValueRef val =
-                                       LLVMBuildExtractValue(builder, ret, i, "");
-
-                               assert(num_out < ARRAY_SIZE(out));
-                               out[num_out++] = val;
-
-                               if (LLVMTypeOf(val) == ctx->ac.i32) {
-                                       assert(num_out_sgpr + 1 == num_out);
-                                       num_out_sgpr = num_out;
-                               }
-                       }
-               }
-       }
-
-       /* Return the value from the last part. */
-       if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
-               LLVMBuildRetVoid(builder);
-       else
-               LLVMBuildRet(builder, ret);
+   LLVMBuilderRef builder = ctx->ac.builder;
+   /* PS epilog has one arg per color component; gfx9 merged shader
+    * prologs need to forward 40 SGPRs.
+    */
+   LLVMValueRef initial[AC_MAX_ARGS], out[AC_MAX_ARGS];
+   LLVMTypeRef function_type;
+   unsigned num_first_params;
+   unsigned num_out, initial_num_out;
+   ASSERTED unsigned num_out_sgpr;         /* used in debug checks */
+   ASSERTED unsigned initial_num_out_sgpr; /* used in debug checks */
+   unsigned num_sgprs, num_vgprs;
+   unsigned gprs;
+
+   memset(&ctx->args, 0, sizeof(ctx->args));
+
+   for (unsigned i = 0; i < num_parts; ++i) {
+      ac_add_function_attr(ctx->ac.context, parts[i], -1, AC_FUNC_ATTR_ALWAYSINLINE);
+      LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
+   }
+
+   /* The parameters of the wrapper function correspond to those of the
+    * first part in terms of SGPRs and VGPRs, but we use the types of the
+    * main part to get the right types. This is relevant for the
+    * dereferenceable attribute on descriptor table pointers.
+    */
+   num_sgprs = 0;
+   num_vgprs = 0;
+
+   function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
+   num_first_params = LLVMCountParamTypes(function_type);
+
+   for (unsigned i = 0; i < num_first_params; ++i) {
+      LLVMValueRef param = LLVMGetParam(parts[0], i);
+
+      if (ac_is_sgpr_param(param)) {
+         assert(num_vgprs == 0);
+         num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
+      } else {
+         num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
+      }
+   }
+
+   gprs = 0;
+   while (gprs < num_sgprs + num_vgprs) {
+      LLVMValueRef param = LLVMGetParam(parts[main_part], ctx->args.arg_count);
+      LLVMTypeRef type = LLVMTypeOf(param);
+      unsigned size = ac_get_type_size(type) / 4;
+
+      /* This is going to get casted anyways, so we don't have to
+       * have the exact same type. But we do have to preserve the
+       * pointer-ness so that LLVM knows about it.
+       */
+      enum ac_arg_type arg_type = AC_ARG_INT;
+      if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
+         type = LLVMGetElementType(type);
+
+         if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
+            if (LLVMGetVectorSize(type) == 4)
+               arg_type = AC_ARG_CONST_DESC_PTR;
+            else if (LLVMGetVectorSize(type) == 8)
+               arg_type = AC_ARG_CONST_IMAGE_PTR;
+            else
+               assert(0);
+         } else if (type == ctx->ac.f32) {
+            arg_type = AC_ARG_CONST_FLOAT_PTR;
+         } else {
+            assert(0);
+         }
+      }
+
+      ac_add_arg(&ctx->args, gprs < num_sgprs ? AC_ARG_SGPR : AC_ARG_VGPR, size, arg_type, NULL);
+
+      assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
+      assert(gprs + size <= num_sgprs + num_vgprs &&
+             (gprs >= num_sgprs || gprs + size <= num_sgprs));
+
+      gprs += size;
+   }
+
+   /* Prepare the return type. */
+   unsigned num_returns = 0;
+   LLVMTypeRef returns[AC_MAX_ARGS], last_func_type, return_type;
+
+   last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1]));
+   return_type = LLVMGetReturnType(last_func_type);
+
+   switch (LLVMGetTypeKind(return_type)) {
+   case LLVMStructTypeKind:
+      num_returns = LLVMCountStructElementTypes(return_type);
+      assert(num_returns <= ARRAY_SIZE(returns));
+      LLVMGetStructElementTypes(return_type, returns);
+      break;
+   case LLVMVoidTypeKind:
+      break;
+   default:
+      unreachable("unexpected type");
+   }
+
+   si_llvm_create_func(ctx, "wrapper", returns, num_returns,
+                       si_get_max_workgroup_size(ctx->shader));
+
+   if (si_is_merged_shader(ctx->shader))
+      ac_init_exec_full_mask(&ctx->ac);
+
+   /* Record the arguments of the function as if they were an output of
+    * a previous part.
+    */
+   num_out = 0;
+   num_out_sgpr = 0;
+
+   for (unsigned i = 0; i < ctx->args.arg_count; ++i) {
+      LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
+      LLVMTypeRef param_type = LLVMTypeOf(param);
+      LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? ctx->ac.i32 : ctx->ac.f32;
+      unsigned size = ac_get_type_size(param_type) / 4;
+
+      if (size == 1) {
+         if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
+            param = LLVMBuildPtrToInt(builder, param, ctx->ac.i32, "");
+            param_type = ctx->ac.i32;
+         }
+
+         if (param_type != out_type)
+            param = LLVMBuildBitCast(builder, param, out_type, "");
+         out[num_out++] = param;
+      } else {
+         LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
+
+         if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
+            param = LLVMBuildPtrToInt(builder, param, ctx->ac.i64, "");
+            param_type = ctx->ac.i64;
+         }
+
+         if (param_type != vector_type)
+            param = LLVMBuildBitCast(builder, param, vector_type, "");
+
+         for (unsigned j = 0; j < size; ++j)
+            out[num_out++] =
+               LLVMBuildExtractElement(builder, param, LLVMConstInt(ctx->ac.i32, j, 0), "");
+      }
+
+      if (ctx->args.args[i].file == AC_ARG_SGPR)
+         num_out_sgpr = num_out;
+   }
+
+   memcpy(initial, out, sizeof(out));
+   initial_num_out = num_out;
+   initial_num_out_sgpr = num_out_sgpr;
+
+   /* Now chain the parts. */
+   LLVMValueRef ret = NULL;
+   for (unsigned part = 0; part < num_parts; ++part) {
+      LLVMValueRef in[AC_MAX_ARGS];
+      LLVMTypeRef ret_type;
+      unsigned out_idx = 0;
+      unsigned num_params = LLVMCountParams(parts[part]);
+
+      /* Merged shaders are executed conditionally depending
+       * on the number of enabled threads passed in the input SGPRs. */
+      if (si_is_multi_part_shader(ctx->shader) && part == 0) {
+         LLVMValueRef ena, count = initial[3];
+
+         count = LLVMBuildAnd(builder, count, LLVMConstInt(ctx->ac.i32, 0x7f, 0), "");
+         ena = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, "");
+         ac_build_ifcc(&ctx->ac, ena, 6506);
+      }
+
+      /* Derive arguments for the next part from outputs of the
+       * previous one.
+       */
+      for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
+         LLVMValueRef param;
+         LLVMTypeRef param_type;
+         bool is_sgpr;
+         unsigned param_size;
+         LLVMValueRef arg = NULL;
+
+         param = LLVMGetParam(parts[part], param_idx);
+         param_type = LLVMTypeOf(param);
+         param_size = ac_get_type_size(param_type) / 4;
+         is_sgpr = ac_is_sgpr_param(param);
+
+         if (is_sgpr) {
+            ac_add_function_attr(ctx->ac.context, parts[part], param_idx + 1, AC_FUNC_ATTR_INREG);
+         } else if (out_idx < num_out_sgpr) {
+            /* Skip returned SGPRs the current part doesn't
+             * declare on the input. */
+            out_idx = num_out_sgpr;
+         }
+
+         assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
+
+         if (param_size == 1)
+            arg = out[out_idx];
+         else
+            arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size);
+
+         if (LLVMTypeOf(arg) != param_type) {
+            if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
+               if (LLVMGetPointerAddressSpace(param_type) == AC_ADDR_SPACE_CONST_32BIT) {
+                  arg = LLVMBuildBitCast(builder, arg, ctx->ac.i32, "");
+                  arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
+               } else {
+                  arg = LLVMBuildBitCast(builder, arg, ctx->ac.i64, "");
+                  arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
+               }
+            } else {
+               arg = LLVMBuildBitCast(builder, arg, param_type, "");
+            }
+         }
+
+         in[param_idx] = arg;
+         out_idx += param_size;
+      }
+
+      ret = ac_build_call(&ctx->ac, parts[part], in, num_params);
+
+      if (si_is_multi_part_shader(ctx->shader) && part + 1 == next_shader_first_part) {
+         ac_build_endif(&ctx->ac, 6506);
+
+         /* The second half of the merged shader should use
+          * the inputs from the toplevel (wrapper) function,
+          * not the return value from the last call.
+          *
+          * That's because the last call was executed condi-
+          * tionally, so we can't consume it in the main
+          * block.
+          */
+         memcpy(out, initial, sizeof(initial));
+         num_out = initial_num_out;
+         num_out_sgpr = initial_num_out_sgpr;
+         continue;
+      }
+
+      /* Extract the returned GPRs. */
+      ret_type = LLVMTypeOf(ret);
+      num_out = 0;
+      num_out_sgpr = 0;
+
+      if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
+         assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
+
+         unsigned ret_size = LLVMCountStructElementTypes(ret_type);
+
+         for (unsigned i = 0; i < ret_size; ++i) {
+            LLVMValueRef val = LLVMBuildExtractValue(builder, ret, i, "");
+
+            assert(num_out < ARRAY_SIZE(out));
+            out[num_out++] = val;
+
+            if (LLVMTypeOf(val) == ctx->ac.i32) {
+               assert(num_out_sgpr + 1 == num_out);
+               num_out_sgpr = num_out;
+            }
+         }
+      }
+   }
+
+   /* Return the value from the last part. */
+   if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
+      LLVMBuildRetVoid(builder);
+   else
+      LLVMBuildRet(builder, ret);
 }
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
index 99ffdd2e980a140d7ea9698579c42bb944235cb8..2a609572d841c1c17b75edb5e68f7efee2460beb 100644
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include "si_shader_internal.h"
 #include "si_pipe.h"
+#include "si_shader_internal.h"
 #include "sid.h"
 #include "util/u_memory.h"
 
 LLVMValueRef si_is_es_thread(struct si_shader_context *ctx)
 {
-       /* Return true if the current thread should execute an ES thread. */
-       return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
-                            ac_get_thread_id(&ctx->ac),
-                            si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), "");
+   /* Return true if the current thread should execute an ES thread. */
+   return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
+                        si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), "");
 }
 
 LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx)
 {
-       /* Return true if the current thread should execute a GS thread. */
-       return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
-                            ac_get_thread_id(&ctx->ac),
-                            si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), "");
+   /* Return true if the current thread should execute a GS thread. */
+   return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
+                        si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), "");
 }
 
-static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi,
-                                         unsigned input_index,
-                                         unsigned vtx_offset_param,
-                                         LLVMTypeRef type,
-                                         unsigned swizzle)
+static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, unsigned input_index,
+                                          unsigned vtx_offset_param, LLVMTypeRef type,
+                                          unsigned swizzle)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct si_shader *shader = ctx->shader;
-       LLVMValueRef vtx_offset, soffset;
-       struct si_shader_info *info = &shader->selector->info;
-       unsigned semantic_name = info->input_semantic_name[input_index];
-       unsigned semantic_index = info->input_semantic_index[input_index];
-       unsigned param;
-       LLVMValueRef value;
-
-       param = si_shader_io_get_unique_index(semantic_name, semantic_index, false);
-
-       /* GFX9 has the ESGS ring in LDS. */
-       if (ctx->screen->info.chip_class >= GFX9) {
-               unsigned index = vtx_offset_param;
-
-               switch (index / 2) {
-               case 0:
-                       vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset,
-                                                    index % 2 ? 16 : 0, 16);
-                       break;
-               case 1:
-                       vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset,
-                                                    index % 2 ? 16 : 0, 16);
-                       break;
-               case 2:
-                       vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset,
-                                                    index % 2 ? 16 : 0, 16);
-                       break;
-               default:
-                       assert(0);
-                       return NULL;
-               }
-
-               unsigned offset = param * 4 + swizzle;
-               vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset,
-                                         LLVMConstInt(ctx->ac.i32, offset, false), "");
-
-               LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset);
-               LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, "");
-               if (ac_get_type_size(type) == 8) {
-                       ptr = LLVMBuildGEP(ctx->ac.builder, ptr,
-                                          &ctx->ac.i32_1, 1, "");
-                       LLVMValueRef values[2] = {
-                               value,
-                               LLVMBuildLoad(ctx->ac.builder, ptr, "")
-                       };
-                       value = ac_build_gather_values(&ctx->ac, values, 2);
-               }
-               return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
-       }
-
-       /* GFX6: input load from the ESGS ring in memory. */
-       if (swizzle == ~0) {
-               LLVMValueRef values[4];
-               unsigned chan;
-               for (chan = 0; chan < 4; chan++) {
-                       values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param,
-                                                            type, chan);
-               }
-               return ac_build_gather_values(&ctx->ac, values, 4);
-       }
-
-       /* Get the vertex offset parameter on GFX6. */
-       LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac,
-                                               ctx->gs_vtx_offset[vtx_offset_param]);
-
-       vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset,
-                                 LLVMConstInt(ctx->ac.i32, 4, 0), "");
-
-       soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle) * 256, 0);
-
-       value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0,
-                                    vtx_offset, soffset, 0, ac_glc, true, false);
-       if (ac_get_type_size(type) == 8) {
-               LLVMValueRef value2;
-               soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle + 1) * 256, 0);
-
-               value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
-                                             ctx->ac.i32_0, vtx_offset, soffset,
-                                             0, ac_glc, true, false);
-               return si_build_gather_64bit(ctx, type, value, value2);
-       }
-       return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader *shader = ctx->shader;
+   LLVMValueRef vtx_offset, soffset;
+   struct si_shader_info *info = &shader->selector->info;
+   unsigned semantic_name = info->input_semantic_name[input_index];
+   unsigned semantic_index = info->input_semantic_index[input_index];
+   unsigned param;
+   LLVMValueRef value;
+
+   param = si_shader_io_get_unique_index(semantic_name, semantic_index, false);
+
+   /* GFX9 has the ESGS ring in LDS. */
+   if (ctx->screen->info.chip_class >= GFX9) {
+      unsigned index = vtx_offset_param;
+
+      switch (index / 2) {
+      case 0:
+         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset, index % 2 ? 16 : 0, 16);
+         break;
+      case 1:
+         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset, index % 2 ? 16 : 0, 16);
+         break;
+      case 2:
+         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset, index % 2 ? 16 : 0, 16);
+         break;
+      default:
+         assert(0);
+         return NULL;
+      }
+
+      unsigned offset = param * 4 + swizzle;
+      vtx_offset =
+         LLVMBuildAdd(ctx->ac.builder, vtx_offset, LLVMConstInt(ctx->ac.i32, offset, false), "");
+
+      LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset);
+      LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, "");
+      if (ac_get_type_size(type) == 8) {
+         ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &ctx->ac.i32_1, 1, "");
+         LLVMValueRef values[2] = {value, LLVMBuildLoad(ctx->ac.builder, ptr, "")};
+         value = ac_build_gather_values(&ctx->ac, values, 2);
+      }
+      return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
+   }
+
+   /* GFX6: input load from the ESGS ring in memory. */
+   if (swizzle == ~0) {
+      LLVMValueRef values[4];
+      unsigned chan;
+      for (chan = 0; chan < 4; chan++) {
+         values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param, type, chan);
+      }
+      return ac_build_gather_values(&ctx->ac, values, 4);
+   }
+
+   /* Get the vertex offset parameter on GFX6. */
+   LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac, ctx->gs_vtx_offset[vtx_offset_param]);
+
+   vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");
+
+   soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle) * 256, 0);
+
+   value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, vtx_offset, soffset, 0,
+                                ac_glc, true, false);
+   if (ac_get_type_size(type) == 8) {
+      LLVMValueRef value2;
+      soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle + 1) * 256, 0);
+
+      value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, vtx_offset, soffset,
+                                    0, ac_glc, true, false);
+      return si_build_gather_64bit(ctx, type, value, value2);
+   }
+   return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
 }
 
-static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi,
-                                        unsigned location,
-                                        unsigned driver_location,
-                                        unsigned component,
-                                        unsigned num_components,
-                                        unsigned vertex_index,
-                                        unsigned const_index,
-                                        LLVMTypeRef type)
+static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi, unsigned location,
+                                         unsigned driver_location, unsigned component,
+                                         unsigned num_components, unsigned vertex_index,
+                                         unsigned const_index, LLVMTypeRef type)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
 
-       LLVMValueRef value[4];
-       for (unsigned i = 0; i < num_components; i++) {
-               unsigned offset = i;
-               if (ac_get_type_size(type) == 8)
-                       offset *= 2;
+   LLVMValueRef value[4];
+   for (unsigned i = 0; i < num_components; i++) {
+      unsigned offset = i;
+      if (ac_get_type_size(type) == 8)
+         offset *= 2;
 
-               offset += component;
-               value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location  / 4 + const_index,
-                                                            vertex_index, type, offset);
-       }
+      offset += component;
+      value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location / 4 + const_index,
+                                                   vertex_index, type, offset);
+   }
 
-       return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
+   return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
 }
 
 /* Pass GS inputs from ES to GS on GFX9. */
 static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
 {
-       LLVMValueRef ret = ctx->return_value;
-
-       ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
-       ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
-       if (ctx->shader->key.as_ngg)
-               ret = si_insert_input_ptr(ctx, ret, ctx->gs_tg_info, 2);
-       else
-               ret = si_insert_input_ret(ctx, ret, ctx->gs2vs_offset, 2);
-       ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
-       ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);
-
-       ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers,
-                                 8 + SI_SGPR_RW_BUFFERS);
-       ret = si_insert_input_ptr(ctx, ret,
-                                 ctx->bindless_samplers_and_images,
-                                 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
-       if (ctx->screen->use_ngg) {
-               ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits,
-                                         8 + SI_SGPR_VS_STATE_BITS);
-       }
-
-       unsigned vgpr;
-       if (ctx->type == PIPE_SHADER_VERTEX)
-               vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR;
-       else
-               vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
-
-       ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++);
-       ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++);
-       ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
-       ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
-       ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++);
-       ctx->return_value = ret;
+   LLVMValueRef ret = ctx->return_value;
+
+   ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
+   ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
+   if (ctx->shader->key.as_ngg)
+      ret = si_insert_input_ptr(ctx, ret, ctx->gs_tg_info, 2);
+   else
+      ret = si_insert_input_ret(ctx, ret, ctx->gs2vs_offset, 2);
+   ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
+   ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);
+
+   ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, 8 + SI_SGPR_RW_BUFFERS);
+   ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
+                             8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
+   if (ctx->screen->use_ngg) {
+      ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
+   }
+
+   unsigned vgpr;
+   if (ctx->type == PIPE_SHADER_VERTEX)
+      vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR;
+   else
+      vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
+
+   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++);
+   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++);
+   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
+   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
+   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++);
+   ctx->return_value = ret;
 }
 
-void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
-                             LLVMValueRef *addrs)
+void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct si_shader *es = ctx->shader;
-       struct si_shader_info *info = &es->selector->info;
-       LLVMValueRef lds_base = NULL;
-       unsigned chan;
-       int i;
-
-       if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
-               unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
-               LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
-               LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->merged_wave_info, 24, 4);
-               vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx,
-                                        LLVMBuildMul(ctx->ac.builder, wave_idx,
-                                                     LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""), "");
-               lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
-                                       LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), "");
-       }
-
-       for (i = 0; i < info->num_outputs; i++) {
-               int param;
-
-               if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
-                   info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
-                       continue;
-
-               param = si_shader_io_get_unique_index(info->output_semantic_name[i],
-                                                     info->output_semantic_index[i], false);
-
-               for (chan = 0; chan < 4; chan++) {
-                       if (!(info->output_usagemask[i] & (1 << chan)))
-                               continue;
-
-                       LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
-                       out_val = ac_to_integer(&ctx->ac, out_val);
-
-                       /* GFX9 has the ESGS ring in LDS. */
-                       if (ctx->screen->info.chip_class >= GFX9) {
-                               LLVMValueRef idx = LLVMConstInt(ctx->ac.i32, param * 4 + chan, false);
-                               idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, "");
-                               ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val);
-                               continue;
-                       }
-
-                       ac_build_buffer_store_dword(&ctx->ac,
-                                                   ctx->esgs_ring,
-                                                   out_val, 1, NULL,
-                                                   ac_get_arg(&ctx->ac, ctx->es2gs_offset),
-                                                   (4 * param + chan) * 4,
-                                                   ac_glc | ac_slc | ac_swizzled);
-               }
-       }
-
-       if (ctx->screen->info.chip_class >= GFX9)
-               si_set_es_return_value_for_gs(ctx);
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader *es = ctx->shader;
+   struct si_shader_info *info = &es->selector->info;
+   LLVMValueRef lds_base = NULL;
+   unsigned chan;
+   int i;
+
+   if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
+      unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
+      LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
+      LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->merged_wave_info, 24, 4);
+      vertex_idx =
+         LLVMBuildOr(ctx->ac.builder, vertex_idx,
+                     LLVMBuildMul(ctx->ac.builder, wave_idx,
+                                  LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""),
+                     "");
+      lds_base =
+         LLVMBuildMul(ctx->ac.builder, vertex_idx, LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), "");
+   }
+
+   for (i = 0; i < info->num_outputs; i++) {
+      int param;
+
+      if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
+          info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
+         continue;
+
+      param = si_shader_io_get_unique_index(info->output_semantic_name[i],
+                                            info->output_semantic_index[i], false);
+
+      for (chan = 0; chan < 4; chan++) {
+         if (!(info->output_usagemask[i] & (1 << chan)))
+            continue;
+
+         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
+         out_val = ac_to_integer(&ctx->ac, out_val);
+
+         /* GFX9 has the ESGS ring in LDS. */
+         if (ctx->screen->info.chip_class >= GFX9) {
+            LLVMValueRef idx = LLVMConstInt(ctx->ac.i32, param * 4 + chan, false);
+            idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, "");
+            ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val);
+            continue;
+         }
+
+         ac_build_buffer_store_dword(&ctx->ac, ctx->esgs_ring, out_val, 1, NULL,
+                                     ac_get_arg(&ctx->ac, ctx->es2gs_offset),
+                                     (4 * param + chan) * 4, ac_glc | ac_slc | ac_swizzled);
+      }
+   }
+
+   if (ctx->screen->info.chip_class >= GFX9)
+      si_set_es_return_value_for_gs(ctx);
 }
 
 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
 {
-       if (ctx->screen->info.chip_class >= GFX9)
-               return si_unpack_param(ctx, ctx->merged_wave_info, 16, 8);
-       else
-               return ac_get_arg(&ctx->ac, ctx->gs_wave_id);
+   if (ctx->screen->info.chip_class >= GFX9)
+      return si_unpack_param(ctx, ctx->merged_wave_info, 16, 8);
+   else
+      return ac_get_arg(&ctx->ac, ctx->gs_wave_id);
 }
 
 static void emit_gs_epilogue(struct si_shader_context *ctx)
 {
-       if (ctx->shader->key.as_ngg) {
-               gfx10_ngg_gs_emit_epilogue(ctx);
-               return;
-       }
+   if (ctx->shader->key.as_ngg) {
+      gfx10_ngg_gs_emit_epilogue(ctx);
+      return;
+   }
 
-       if (ctx->screen->info.chip_class >= GFX10)
-               LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");
+   if (ctx->screen->info.chip_class >= GFX10)
+      LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");
 
-       ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
-                        si_get_gs_wave_id(ctx));
+   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, si_get_gs_wave_id(ctx));
 
-       if (ctx->screen->info.chip_class >= GFX9)
-               ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
+   if (ctx->screen->info.chip_class >= GFX9)
+      ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
 }
 
-static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi,
-                                    unsigned max_outputs,
-                                    LLVMValueRef *addrs)
+static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
+                                     LLVMValueRef *addrs)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct si_shader_info UNUSED *info = &ctx->shader->selector->info;
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader_info UNUSED *info = &ctx->shader->selector->info;
 
-       assert(info->num_outputs <= max_outputs);
+   assert(info->num_outputs <= max_outputs);
 
-       emit_gs_epilogue(ctx);
+   emit_gs_epilogue(ctx);
 }
 
 /* Emit one vertex from the geometry shader */
-static void si_llvm_emit_vertex(struct ac_shader_abi *abi,
-                               unsigned stream,
-                               LLVMValueRef *addrs)
+static void si_llvm_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-
-       if (ctx->shader->key.as_ngg) {
-               gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
-               return;
-       }
-
-       struct si_shader_info *info = &ctx->shader->selector->info;
-       struct si_shader *shader = ctx->shader;
-       LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->gs2vs_offset);
-       LLVMValueRef gs_next_vertex;
-       LLVMValueRef can_emit;
-       unsigned chan, offset;
-       int i;
-
-       /* Write vertex attribute values to GSVS ring */
-       gs_next_vertex = LLVMBuildLoad(ctx->ac.builder,
-                                      ctx->gs_next_vertex[stream],
-                                      "");
-
-       /* If this thread has already emitted the declared maximum number of
-        * vertices, skip the write: excessive vertex emissions are not
-        * supposed to have any effect.
-        *
-        * If the shader has no writes to memory, kill it instead. This skips
-        * further memory loads and may allow LLVM to skip to the end
-        * altogether.
-        */
-       can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
-                                LLVMConstInt(ctx->ac.i32,
-                                             shader->selector->gs_max_out_vertices, 0), "");
-
-       bool use_kill = !info->writes_memory;
-       if (use_kill) {
-               ac_build_kill_if_false(&ctx->ac, can_emit);
-       } else {
-               ac_build_ifcc(&ctx->ac, can_emit, 6505);
-       }
-
-       offset = 0;
-       for (i = 0; i < info->num_outputs; i++) {
-               for (chan = 0; chan < 4; chan++) {
-                       if (!(info->output_usagemask[i] & (1 << chan)) ||
-                           ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
-                               continue;
-
-                       LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
-                       LLVMValueRef voffset =
-                               LLVMConstInt(ctx->ac.i32, offset *
-                                            shader->selector->gs_max_out_vertices, 0);
-                       offset++;
-
-                       voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
-                       voffset = LLVMBuildMul(ctx->ac.builder, voffset,
-                                              LLVMConstInt(ctx->ac.i32, 4, 0), "");
-
-                       out_val = ac_to_integer(&ctx->ac, out_val);
-
-                       ac_build_buffer_store_dword(&ctx->ac,
-                                                   ctx->gsvs_ring[stream],
-                                                   out_val, 1,
-                                                   voffset, soffset, 0,
-                                                   ac_glc | ac_slc | ac_swizzled);
-               }
-       }
-
-       gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->ac.i32_1, "");
-       LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
-
-       /* Signal vertex emission if vertex data was written. */
-       if (offset) {
-               ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
-                                si_get_gs_wave_id(ctx));
-       }
-
-       if (!use_kill)
-               ac_build_endif(&ctx->ac, 6505);
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+
+   if (ctx->shader->key.as_ngg) {
+      gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
+      return;
+   }
+
+   struct si_shader_info *info = &ctx->shader->selector->info;
+   struct si_shader *shader = ctx->shader;
+   LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->gs2vs_offset);
+   LLVMValueRef gs_next_vertex;
+   LLVMValueRef can_emit;
+   unsigned chan, offset;
+   int i;
+
+   /* Write vertex attribute values to GSVS ring */
+   gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, ctx->gs_next_vertex[stream], "");
+
+   /* If this thread has already emitted the declared maximum number of
+    * vertices, skip the write: excessive vertex emissions are not
+    * supposed to have any effect.
+    *
+    * If the shader has no writes to memory, kill it instead. This skips
+    * further memory loads and may allow LLVM to skip to the end
+    * altogether.
+    */
+   can_emit =
+      LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
+                    LLVMConstInt(ctx->ac.i32, shader->selector->gs_max_out_vertices, 0), "");
+
+   bool use_kill = !info->writes_memory;
+   if (use_kill) {
+      ac_build_kill_if_false(&ctx->ac, can_emit);
+   } else {
+      ac_build_ifcc(&ctx->ac, can_emit, 6505);
+   }
+
+   offset = 0;
+   for (i = 0; i < info->num_outputs; i++) {
+      for (chan = 0; chan < 4; chan++) {
+         if (!(info->output_usagemask[i] & (1 << chan)) ||
+             ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
+            continue;
+
+         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
+         LLVMValueRef voffset =
+            LLVMConstInt(ctx->ac.i32, offset * shader->selector->gs_max_out_vertices, 0);
+         offset++;
+
+         voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
+         voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, 0), "");
+
+         out_val = ac_to_integer(&ctx->ac, out_val);
+
+         ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, 1, voffset, soffset,
+                                     0, ac_glc | ac_slc | ac_swizzled);
+      }
+   }
+
+   gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->ac.i32_1, "");
+   LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
+
+   /* Signal vertex emission if vertex data was written. */
+   if (offset) {
+      ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
+                       si_get_gs_wave_id(ctx));
+   }
+
+   if (!use_kill)
+      ac_build_endif(&ctx->ac, 6505);
 }
 
 /* Cut one primitive from the geometry shader */
-static void si_llvm_emit_primitive(struct ac_shader_abi *abi,
-                                  unsigned stream)
+static void si_llvm_emit_primitive(struct ac_shader_abi *abi, unsigned stream)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
 
-       if (ctx->shader->key.as_ngg) {
-               LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
-               return;
-       }
+   if (ctx->shader->key.as_ngg) {
+      LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
+      return;
+   }
 
-       /* Signal primitive cut */
-       ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
-                        si_get_gs_wave_id(ctx));
+   /* Signal primitive cut */
+   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
+                    si_get_gs_wave_id(ctx));
 }
 
 void si_preload_esgs_ring(struct si_shader_context *ctx)
 {
-       if (ctx->screen->info.chip_class <= GFX8) {
-               unsigned ring =
-                       ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
-                                                         : SI_ES_RING_ESGS;
-               LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, 0);
-               LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
-
-               ctx->esgs_ring =
-                       ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
-       } else {
-               if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
-                       /* Declare the ESGS ring as an explicit LDS symbol. */
-                       si_llvm_declare_esgs_ring(ctx);
-               } else {
-                       ac_declare_lds_as_pointer(&ctx->ac);
-                       ctx->esgs_ring = ctx->ac.lds;
-               }
-       }
+   if (ctx->screen->info.chip_class <= GFX8) {
+      unsigned ring = ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS : SI_ES_RING_ESGS;
+      LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, 0);
+      LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+
+      ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
+   } else {
+      if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
+         /* Declare the ESGS ring as an explicit LDS symbol. */
+         si_llvm_declare_esgs_ring(ctx);
+      } else {
+         ac_declare_lds_as_pointer(&ctx->ac);
+         ctx->esgs_ring = ctx->ac.lds;
+      }
+   }
 }
 
 void si_preload_gs_rings(struct si_shader_context *ctx)
 {
-       const struct si_shader_selector *sel = ctx->shader->selector;
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_RING_GSVS, 0);
-       LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
-       LLVMValueRef base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
-
-       /* The conceptual layout of the GSVS ring is
-        *   v0c0 .. vLv0 v0c1 .. vLc1 ..
-        * but the real memory layout is swizzled across
-        * threads:
-        *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
-        *   t16v0c0 ..
-        * Override the buffer descriptor accordingly.
-        */
-       LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
-       uint64_t stream_offset = 0;
-
-       for (unsigned stream = 0; stream < 4; ++stream) {
-               unsigned num_components;
-               unsigned stride;
-               unsigned num_records;
-               LLVMValueRef ring, tmp;
-
-               num_components = sel->info.num_stream_output_components[stream];
-               if (!num_components)
-                       continue;
-
-               stride = 4 * num_components * sel->gs_max_out_vertices;
-
-               /* Limit on the stride field for <= GFX7. */
-               assert(stride < (1 << 14));
-
-               num_records = ctx->ac.wave_size;
-
-               ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
-               tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_0, "");
-               tmp = LLVMBuildAdd(builder, tmp,
-                                  LLVMConstInt(ctx->ac.i64,
-                                               stream_offset, 0), "");
-               stream_offset += stride * ctx->ac.wave_size;
-
-               ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_0, "");
-               ring = LLVMBuildBitCast(builder, ring, ctx->ac.v4i32, "");
-               tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_1, "");
-               tmp = LLVMBuildOr(builder, tmp,
-                       LLVMConstInt(ctx->ac.i32,
-                                    S_008F04_STRIDE(stride) |
-                                    S_008F04_SWIZZLE_ENABLE(1), 0), "");
-               ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_1, "");
-               ring = LLVMBuildInsertElement(builder, ring,
-                               LLVMConstInt(ctx->ac.i32, num_records, 0),
-                               LLVMConstInt(ctx->ac.i32, 2, 0), "");
-
-               uint32_t rsrc3 =
-                               S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
-                               S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-                               S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
-                               S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
-                               S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
-                               S_008F0C_ADD_TID_ENABLE(1);
-
-               if (ctx->ac.chip_class >= GFX10) {
-                       rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
-                                S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) |
-                                S_008F0C_RESOURCE_LEVEL(1);
-               } else {
-                       rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
-                                S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
-                                S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
-               }
-
-               ring = LLVMBuildInsertElement(builder, ring,
-                       LLVMConstInt(ctx->ac.i32, rsrc3, false),
-                       LLVMConstInt(ctx->ac.i32, 3, 0), "");
-
-               ctx->gsvs_ring[stream] = ring;
-       }
+   const struct si_shader_selector *sel = ctx->shader->selector;
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_RING_GSVS, 0);
+   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+   LLVMValueRef base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
+
+   /* The conceptual layout of the GSVS ring is
+    *   v0c0 .. vLv0 v0c1 .. vLc1 ..
+    * but the real memory layout is swizzled across
+    * threads:
+    *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
+    *   t16v0c0 ..
+    * Override the buffer descriptor accordingly.
+    */
+   LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
+   uint64_t stream_offset = 0;
+
+   for (unsigned stream = 0; stream < 4; ++stream) {
+      unsigned num_components;
+      unsigned stride;
+      unsigned num_records;
+      LLVMValueRef ring, tmp;
+
+      num_components = sel->info.num_stream_output_components[stream];
+      if (!num_components)
+         continue;
+
+      stride = 4 * num_components * sel->gs_max_out_vertices;
+
+      /* Limit on the stride field for <= GFX7. */
+      assert(stride < (1 << 14));
+
+      num_records = ctx->ac.wave_size;
+
+      ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
+      tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_0, "");
+      tmp = LLVMBuildAdd(builder, tmp, LLVMConstInt(ctx->ac.i64, stream_offset, 0), "");
+      stream_offset += stride * ctx->ac.wave_size;
+
+      ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_0, "");
+      ring = LLVMBuildBitCast(builder, ring, ctx->ac.v4i32, "");
+      tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_1, "");
+      tmp = LLVMBuildOr(
+         builder, tmp,
+         LLVMConstInt(ctx->ac.i32, S_008F04_STRIDE(stride) | S_008F04_SWIZZLE_ENABLE(1), 0), "");
+      ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_1, "");
+      ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, num_records, 0),
+                                    LLVMConstInt(ctx->ac.i32, 2, 0), "");
+
+      uint32_t rsrc3 =
+         S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+         S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
+         S_008F0C_ADD_TID_ENABLE(1);
+
+      if (ctx->ac.chip_class >= GFX10) {
+         rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
+                  S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1);
+      } else {
+         rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+                  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
+                  S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
+      }
+
+      ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, rsrc3, false),
+                                    LLVMConstInt(ctx->ac.i32, 3, 0), "");
+
+      ctx->gsvs_ring[stream] = ring;
+   }
 }
 
 /* Generate code for the hardware VS shader stage to go with a geometry shader */
-struct si_shader *
-si_generate_gs_copy_shader(struct si_screen *sscreen,
-                          struct ac_llvm_compiler *compiler,
-                          struct si_shader_selector *gs_selector,
-                          struct pipe_debug_callback *debug)
+struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
+                                             struct ac_llvm_compiler *compiler,
+                                             struct si_shader_selector *gs_selector,
+                                             struct pipe_debug_callback *debug)
 {
-       struct si_shader_context ctx;
-       struct si_shader *shader;
-       LLVMBuilderRef builder;
-       struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
-       struct si_shader_info *gsinfo = &gs_selector->info;
-       int i;
-
-
-       shader = CALLOC_STRUCT(si_shader);
-       if (!shader)
-               return NULL;
-
-       /* We can leave the fence as permanently signaled because the GS copy
-        * shader only becomes visible globally after it has been compiled. */
-       util_queue_fence_init(&shader->ready);
-
-       shader->selector = gs_selector;
-       shader->is_gs_copy_shader = true;
+   struct si_shader_context ctx;
+   struct si_shader *shader;
+   LLVMBuilderRef builder;
+   struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
+   struct si_shader_info *gsinfo = &gs_selector->info;
+   int i;
+
+   shader = CALLOC_STRUCT(si_shader);
+   if (!shader)
+      return NULL;
+
+   /* We can leave the fence as permanently signaled because the GS copy
+    * shader only becomes visible globally after it has been compiled. */
+   util_queue_fence_init(&shader->ready);
+
+   shader->selector = gs_selector;
+   shader->is_gs_copy_shader = true;
+
+   si_llvm_context_init(&ctx, sscreen, compiler,
+                        si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false, false, false));
+   ctx.shader = shader;
+   ctx.type = PIPE_SHADER_VERTEX;
+
+   builder = ctx.ac.builder;
+
+   si_create_function(&ctx, false);
+
+   LLVMValueRef buf_ptr = ac_get_arg(&ctx.ac, ctx.rw_buffers);
+   ctx.gsvs_ring[0] =
+      ac_build_load_to_sgpr(&ctx.ac, buf_ptr, LLVMConstInt(ctx.ac.i32, SI_RING_GSVS, 0));
+
+   LLVMValueRef voffset =
+      LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id, LLVMConstInt(ctx.ac.i32, 4, 0), "");
+
+   /* Fetch the vertex stream ID.*/
+   LLVMValueRef stream_id;
+
+   if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs)
+      stream_id = si_unpack_param(&ctx, ctx.streamout_config, 24, 2);
+   else
+      stream_id = ctx.ac.i32_0;
+
+   /* Fill in output information. */
+   for (i = 0; i < gsinfo->num_outputs; ++i) {
+      outputs[i].semantic_name = gsinfo->output_semantic_name[i];
+      outputs[i].semantic_index = gsinfo->output_semantic_index[i];
+
+      for (int chan = 0; chan < 4; chan++) {
+         outputs[i].vertex_stream[chan] = (gsinfo->output_streams[i] >> (2 * chan)) & 3;
+      }
+   }
+
+   LLVMBasicBlockRef end_bb;
+   LLVMValueRef switch_inst;
+
+   end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
+   switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
+
+   for (int stream = 0; stream < 4; stream++) {
+      LLVMBasicBlockRef bb;
+      unsigned offset;
+
+      if (!gsinfo->num_stream_output_components[stream])
+         continue;
+
+      if (stream > 0 && !gs_selector->so.num_outputs)
+         continue;
+
+      bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
+      LLVMAddCase(switch_inst, LLVMConstInt(ctx.ac.i32, stream, 0), bb);
+      LLVMPositionBuilderAtEnd(builder, bb);
+
+      /* Fetch vertex data from GSVS ring */
+      offset = 0;
+      for (i = 0; i < gsinfo->num_outputs; ++i) {
+         for (unsigned chan = 0; chan < 4; chan++) {
+            if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
+                outputs[i].vertex_stream[chan] != stream) {
+               outputs[i].values[chan] = LLVMGetUndef(ctx.ac.f32);
+               continue;
+            }
+
+            LLVMValueRef soffset =
+               LLVMConstInt(ctx.ac.i32, offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
+            offset++;
+
+            outputs[i].values[chan] =
+               ac_build_buffer_load(&ctx.ac, ctx.gsvs_ring[0], 1, ctx.ac.i32_0, voffset, soffset, 0,
+                                    ac_glc | ac_slc, true, false);
+         }
+      }
+
+      /* Streamout and exports. */
+      if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) {
+         si_llvm_emit_streamout(&ctx, outputs, gsinfo->num_outputs, stream);
+      }
+
+      if (stream == 0)
+         si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs);
+
+      LLVMBuildBr(builder, end_bb);
+   }
+
+   LLVMPositionBuilderAtEnd(builder, end_bb);
+
+   LLVMBuildRetVoid(ctx.ac.builder);
+
+   ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
+   si_llvm_optimize_module(&ctx);
+
+   bool ok = false;
+   if (si_compile_llvm(sscreen, &ctx.shader->binary, &ctx.shader->config, ctx.compiler, &ctx.ac,
+                       debug, PIPE_SHADER_GEOMETRY, "GS Copy Shader", false)) {
+      if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY))
+         fprintf(stderr, "GS Copy Shader:\n");
+      si_shader_dump(sscreen, ctx.shader, debug, stderr, true);
+
+      if (!ctx.shader->config.scratch_bytes_per_wave)
+         ok = si_shader_binary_upload(sscreen, ctx.shader, 0);
+      else
+         ok = true;
+   }
 
-       si_llvm_context_init(&ctx, sscreen, compiler,
-                            si_get_wave_size(sscreen, PIPE_SHADER_VERTEX,
-                                             false, false, false));
-       ctx.shader = shader;
-       ctx.type = PIPE_SHADER_VERTEX;
-
-       builder = ctx.ac.builder;
-
-       si_create_function(&ctx, false);
-
-       LLVMValueRef buf_ptr = ac_get_arg(&ctx.ac, ctx.rw_buffers);
-       ctx.gsvs_ring[0] = ac_build_load_to_sgpr(&ctx.ac, buf_ptr,
-                                                LLVMConstInt(ctx.ac.i32, SI_RING_GSVS, 0));
-
-       LLVMValueRef voffset =
-               LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id,
-                            LLVMConstInt(ctx.ac.i32, 4, 0), "");
-
-       /* Fetch the vertex stream ID.*/
-       LLVMValueRef stream_id;
-
-       if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs)
-               stream_id = si_unpack_param(&ctx, ctx.streamout_config, 24, 2);
-       else
-               stream_id = ctx.ac.i32_0;
-
-       /* Fill in output information. */
-       for (i = 0; i < gsinfo->num_outputs; ++i) {
-               outputs[i].semantic_name = gsinfo->output_semantic_name[i];
-               outputs[i].semantic_index = gsinfo->output_semantic_index[i];
-
-               for (int chan = 0; chan < 4; chan++) {
-                       outputs[i].vertex_stream[chan] =
-                               (gsinfo->output_streams[i] >> (2 * chan)) & 3;
-               }
-       }
-
-       LLVMBasicBlockRef end_bb;
-       LLVMValueRef switch_inst;
-
-       end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
-       switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
-
-       for (int stream = 0; stream < 4; stream++) {
-               LLVMBasicBlockRef bb;
-               unsigned offset;
-
-               if (!gsinfo->num_stream_output_components[stream])
-                       continue;
-
-               if (stream > 0 && !gs_selector->so.num_outputs)
-                       continue;
-
-               bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
-               LLVMAddCase(switch_inst, LLVMConstInt(ctx.ac.i32, stream, 0), bb);
-               LLVMPositionBuilderAtEnd(builder, bb);
-
-               /* Fetch vertex data from GSVS ring */
-               offset = 0;
-               for (i = 0; i < gsinfo->num_outputs; ++i) {
-                       for (unsigned chan = 0; chan < 4; chan++) {
-                               if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
-                                   outputs[i].vertex_stream[chan] != stream) {
-                                       outputs[i].values[chan] = LLVMGetUndef(ctx.ac.f32);
-                                       continue;
-                               }
-
-                               LLVMValueRef soffset = LLVMConstInt(ctx.ac.i32,
-                                       offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
-                               offset++;
-
-                               outputs[i].values[chan] =
-                                       ac_build_buffer_load(&ctx.ac,
-                                                            ctx.gsvs_ring[0], 1,
-                                                            ctx.ac.i32_0, voffset,
-                                                            soffset, 0, ac_glc | ac_slc,
-                                                            true, false);
-                       }
-               }
-
-               /* Streamout and exports. */
-               if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) {
-                       si_llvm_emit_streamout(&ctx, outputs,
-                                              gsinfo->num_outputs,
-                                              stream);
-               }
-
-               if (stream == 0)
-                       si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs);
-
-               LLVMBuildBr(builder, end_bb);
-       }
-
-       LLVMPositionBuilderAtEnd(builder, end_bb);
-
-       LLVMBuildRetVoid(ctx.ac.builder);
-
-       ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
-       si_llvm_optimize_module(&ctx);
-
-       bool ok = false;
-       if (si_compile_llvm(sscreen, &ctx.shader->binary,
-                           &ctx.shader->config, ctx.compiler, &ctx.ac,
-                           debug, PIPE_SHADER_GEOMETRY,
-                           "GS Copy Shader", false)) {
-               if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY))
-                       fprintf(stderr, "GS Copy Shader:\n");
-               si_shader_dump(sscreen, ctx.shader, debug, stderr, true);
-
-               if (!ctx.shader->config.scratch_bytes_per_wave)
-                       ok = si_shader_binary_upload(sscreen, ctx.shader, 0);
-               else
-                       ok = true;
-       }
-
-       si_llvm_dispose(&ctx);
-
-       if (!ok) {
-               FREE(shader);
-               shader = NULL;
-       } else {
-               si_fix_resource_usage(sscreen, shader);
-       }
-       return shader;
+   si_llvm_dispose(&ctx);
+
+   if (!ok) {
+      FREE(shader);
+      shader = NULL;
+   } else {
+      si_fix_resource_usage(sscreen, shader);
+   }
+   return shader;
 }
 
 /**
  * Build the GS prolog function. Rotate the input vertices for triangle strips
  * with adjacency.
  */
-void si_llvm_build_gs_prolog(struct si_shader_context *ctx,
-                            union si_shader_part_key *key)
+void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
 {
-       unsigned num_sgprs, num_vgprs;
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMTypeRef returns[AC_MAX_ARGS];
-       LLVMValueRef func, ret;
-
-       memset(&ctx->args, 0, sizeof(ctx->args));
-
-       if (ctx->screen->info.chip_class >= GFX9) {
-               if (key->gs_prolog.states.gfx9_prev_is_vs)
-                       num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR;
-               else
-                       num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR;
-               num_vgprs = 5; /* ES inputs are not needed by GS */
-       } else {
-               num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
-               num_vgprs = 8;
-       }
-
-       for (unsigned i = 0; i < num_sgprs; ++i) {
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               returns[i] = ctx->ac.i32;
-       }
-
-       for (unsigned i = 0; i < num_vgprs; ++i) {
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
-               returns[num_sgprs + i] = ctx->ac.f32;
-       }
-
-       /* Create the function. */
-       si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0);
-       func = ctx->main_fn;
-
-       /* Set the full EXEC mask for the prolog, because we are only fiddling
-        * with registers here. The main shader part will set the correct EXEC
-        * mask.
-        */
-       if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
-               ac_init_exec_full_mask(&ctx->ac);
-
-       /* Copy inputs to outputs. This should be no-op, as the registers match,
-        * but it will prevent the compiler from overwriting them unintentionally.
-        */
-       ret = ctx->return_value;
-       for (unsigned i = 0; i < num_sgprs; i++) {
-               LLVMValueRef p = LLVMGetParam(func, i);
-               ret = LLVMBuildInsertValue(builder, ret, p, i, "");
-       }
-       for (unsigned i = 0; i < num_vgprs; i++) {
-               LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
-               p = ac_to_float(&ctx->ac, p);
-               ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
-       }
-
-       if (key->gs_prolog.states.tri_strip_adj_fix) {
-               /* Remap the input vertices for every other primitive. */
-               const struct ac_arg gfx6_vtx_params[6] = {
-                       { .used = true, .arg_index = num_sgprs },
-                       { .used = true, .arg_index = num_sgprs + 1 },
-                       { .used = true, .arg_index = num_sgprs + 3 },
-                       { .used = true, .arg_index = num_sgprs + 4 },
-                       { .used = true, .arg_index = num_sgprs + 5 },
-                       { .used = true, .arg_index = num_sgprs + 6 },
-               };
-               const struct ac_arg gfx9_vtx_params[3] = {
-                       { .used = true, .arg_index = num_sgprs },
-                       { .used = true, .arg_index = num_sgprs + 1 },
-                       { .used = true, .arg_index = num_sgprs + 4 },
-               };
-               LLVMValueRef vtx_in[6], vtx_out[6];
-               LLVMValueRef prim_id, rotate;
-
-               if (ctx->screen->info.chip_class >= GFX9) {
-                       for (unsigned i = 0; i < 3; i++) {
-                               vtx_in[i*2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
-                               vtx_in[i*2+1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
-                       }
-               } else {
-                       for (unsigned i = 0; i < 6; i++)
-                               vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]);
-               }
-
-               prim_id = LLVMGetParam(func, num_sgprs + 2);
-               rotate = LLVMBuildTrunc(builder, prim_id, ctx->ac.i1, "");
-
-               for (unsigned i = 0; i < 6; ++i) {
-                       LLVMValueRef base, rotated;
-                       base = vtx_in[i];
-                       rotated = vtx_in[(i + 4) % 6];
-                       vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
-               }
-
-               if (ctx->screen->info.chip_class >= GFX9) {
-                       for (unsigned i = 0; i < 3; i++) {
-                               LLVMValueRef hi, out;
-
-                               hi = LLVMBuildShl(builder, vtx_out[i*2+1],
-                                                 LLVMConstInt(ctx->ac.i32, 16, 0), "");
-                               out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
-                               out = ac_to_float(&ctx->ac, out);
-                               ret = LLVMBuildInsertValue(builder, ret, out,
-                                                          gfx9_vtx_params[i].arg_index, "");
-                       }
-               } else {
-                       for (unsigned i = 0; i < 6; i++) {
-                               LLVMValueRef out;
-
-                               out = ac_to_float(&ctx->ac, vtx_out[i]);
-                               ret = LLVMBuildInsertValue(builder, ret, out,
-                                                          gfx6_vtx_params[i].arg_index, "");
-                       }
-               }
-       }
-
-       LLVMBuildRet(builder, ret);
+   unsigned num_sgprs, num_vgprs;
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMTypeRef returns[AC_MAX_ARGS];
+   LLVMValueRef func, ret;
+
+   memset(&ctx->args, 0, sizeof(ctx->args));
+
+   if (ctx->screen->info.chip_class >= GFX9) {
+      if (key->gs_prolog.states.gfx9_prev_is_vs)
+         num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR;
+      else
+         num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR;
+      num_vgprs = 5; /* ES inputs are not needed by GS */
+   } else {
+      num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
+      num_vgprs = 8;
+   }
+
+   for (unsigned i = 0; i < num_sgprs; ++i) {
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      returns[i] = ctx->ac.i32;
+   }
+
+   for (unsigned i = 0; i < num_vgprs; ++i) {
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
+      returns[num_sgprs + i] = ctx->ac.f32;
+   }
+
+   /* Create the function. */
+   si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0);
+   func = ctx->main_fn;
+
+   /* Set the full EXEC mask for the prolog, because we are only fiddling
+    * with registers here. The main shader part will set the correct EXEC
+    * mask.
+    */
+   if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
+      ac_init_exec_full_mask(&ctx->ac);
+
+   /* Copy inputs to outputs. This should be no-op, as the registers match,
+    * but it will prevent the compiler from overwriting them unintentionally.
+    */
+   ret = ctx->return_value;
+   for (unsigned i = 0; i < num_sgprs; i++) {
+      LLVMValueRef p = LLVMGetParam(func, i);
+      ret = LLVMBuildInsertValue(builder, ret, p, i, "");
+   }
+   for (unsigned i = 0; i < num_vgprs; i++) {
+      LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
+      p = ac_to_float(&ctx->ac, p);
+      ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
+   }
+
+   if (key->gs_prolog.states.tri_strip_adj_fix) {
+      /* Remap the input vertices for every other primitive. */
+      const struct ac_arg gfx6_vtx_params[6] = {
+         {.used = true, .arg_index = num_sgprs},     {.used = true, .arg_index = num_sgprs + 1},
+         {.used = true, .arg_index = num_sgprs + 3}, {.used = true, .arg_index = num_sgprs + 4},
+         {.used = true, .arg_index = num_sgprs + 5}, {.used = true, .arg_index = num_sgprs + 6},
+      };
+      const struct ac_arg gfx9_vtx_params[3] = {
+         {.used = true, .arg_index = num_sgprs},
+         {.used = true, .arg_index = num_sgprs + 1},
+         {.used = true, .arg_index = num_sgprs + 4},
+      };
+      LLVMValueRef vtx_in[6], vtx_out[6];
+      LLVMValueRef prim_id, rotate;
+
+      if (ctx->screen->info.chip_class >= GFX9) {
+         for (unsigned i = 0; i < 3; i++) {
+            vtx_in[i * 2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
+            vtx_in[i * 2 + 1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
+         }
+      } else {
+         for (unsigned i = 0; i < 6; i++)
+            vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]);
+      }
+
+      prim_id = LLVMGetParam(func, num_sgprs + 2);
+      rotate = LLVMBuildTrunc(builder, prim_id, ctx->ac.i1, "");
+
+      for (unsigned i = 0; i < 6; ++i) {
+         LLVMValueRef base, rotated;
+         base = vtx_in[i];
+         rotated = vtx_in[(i + 4) % 6];
+         vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
+      }
+
+      if (ctx->screen->info.chip_class >= GFX9) {
+         for (unsigned i = 0; i < 3; i++) {
+            LLVMValueRef hi, out;
+
+            hi = LLVMBuildShl(builder, vtx_out[i * 2 + 1], LLVMConstInt(ctx->ac.i32, 16, 0), "");
+            out = LLVMBuildOr(builder, vtx_out[i * 2], hi, "");
+            out = ac_to_float(&ctx->ac, out);
+            ret = LLVMBuildInsertValue(builder, ret, out, gfx9_vtx_params[i].arg_index, "");
+         }
+      } else {
+         for (unsigned i = 0; i < 6; i++) {
+            LLVMValueRef out;
+
+            out = ac_to_float(&ctx->ac, vtx_out[i]);
+            ret = LLVMBuildInsertValue(builder, ret, out, gfx6_vtx_params[i].arg_index, "");
+         }
+      }
+   }
+
+   LLVMBuildRet(builder, ret);
 }
 
 void si_llvm_init_gs_callbacks(struct si_shader_context *ctx)
 {
-       ctx->abi.load_inputs = si_nir_load_input_gs;
-       ctx->abi.emit_vertex = si_llvm_emit_vertex;
-       ctx->abi.emit_primitive = si_llvm_emit_primitive;
-       ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
+   ctx->abi.load_inputs = si_nir_load_input_gs;
+   ctx->abi.emit_vertex = si_llvm_emit_vertex;
+   ctx->abi.emit_primitive = si_llvm_emit_primitive;
+   ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
 }
index c2efcc88e991f366e5a4c74d85bd48d3cccf3a64..6e4d5d429c7bd5f6fdd8f1c66d4b3a0a570f4b80 100644 (file)
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include "si_shader_internal.h"
 #include "si_pipe.h"
+#include "si_shader_internal.h"
 #include "sid.h"
 
 LLVMValueRef si_get_sample_id(struct si_shader_context *ctx)
 {
-       return si_unpack_param(ctx, ctx->args.ancillary, 8, 4);
+   return si_unpack_param(ctx, ctx->args.ancillary, 8, 4);
 }
 
 static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       return ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.sample_coverage));
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   return ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.sample_coverage));
 }
 
 static LLVMValueRef load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       LLVMValueRef desc = ac_get_arg(&ctx->ac, ctx->rw_buffers);
-       LLVMValueRef buf_index = LLVMConstInt(ctx->ac.i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
-       LLVMValueRef resource = ac_build_load_to_sgpr(&ctx->ac, desc, buf_index);
-
-       /* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
-       LLVMValueRef offset0 = LLVMBuildMul(ctx->ac.builder, sample_id, LLVMConstInt(ctx->ac.i32, 8, 0), "");
-       LLVMValueRef offset1 = LLVMBuildAdd(ctx->ac.builder, offset0, LLVMConstInt(ctx->ac.i32, 4, 0), "");
-
-       LLVMValueRef pos[4] = {
-               si_buffer_load_const(ctx, resource, offset0),
-               si_buffer_load_const(ctx, resource, offset1),
-               LLVMConstReal(ctx->ac.f32, 0),
-               LLVMConstReal(ctx->ac.f32, 0)
-       };
-
-       return ac_build_gather_values(&ctx->ac, pos, 4);
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   LLVMValueRef desc = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+   LLVMValueRef buf_index = LLVMConstInt(ctx->ac.i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
+   LLVMValueRef resource = ac_build_load_to_sgpr(&ctx->ac, desc, buf_index);
+
+   /* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
+   LLVMValueRef offset0 =
+      LLVMBuildMul(ctx->ac.builder, sample_id, LLVMConstInt(ctx->ac.i32, 8, 0), "");
+   LLVMValueRef offset1 =
+      LLVMBuildAdd(ctx->ac.builder, offset0, LLVMConstInt(ctx->ac.i32, 4, 0), "");
+
+   LLVMValueRef pos[4] = {si_buffer_load_const(ctx, resource, offset0),
+                          si_buffer_load_const(ctx, resource, offset1),
+                          LLVMConstReal(ctx->ac.f32, 0), LLVMConstReal(ctx->ac.f32, 0)};
+
+   return ac_build_gather_values(&ctx->ac, pos, 4);
 }
 
 static LLVMValueRef si_nir_emit_fbfetch(struct ac_shader_abi *abi)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct ac_image_args args = {};
-       LLVMValueRef ptr, image, fmask;
-
-       /* Ignore src0, because KHR_blend_func_extended disallows multiple render
-        * targets.
-        */
-
-       /* Load the image descriptor. */
-       STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0 % 2 == 0);
-       ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
-       ptr = LLVMBuildPointerCast(ctx->ac.builder, ptr,
-                                  ac_array_in_const32_addr_space(ctx->ac.v8i32), "");
-       image = ac_build_load_to_sgpr(&ctx->ac, ptr,
-                       LLVMConstInt(ctx->ac.i32, SI_PS_IMAGE_COLORBUF0 / 2, 0));
-
-       unsigned chan = 0;
-
-       args.coords[chan++] = si_unpack_param(ctx, ctx->pos_fixed_pt, 0, 16);
-
-       if (!ctx->shader->key.mono.u.ps.fbfetch_is_1D)
-               args.coords[chan++] = si_unpack_param(ctx, ctx->pos_fixed_pt, 16, 16);
-
-       /* Get the current render target layer index. */
-       if (ctx->shader->key.mono.u.ps.fbfetch_layered)
-               args.coords[chan++] = si_unpack_param(ctx, ctx->args.ancillary, 16, 11);
-
-       if (ctx->shader->key.mono.u.ps.fbfetch_msaa)
-               args.coords[chan++] = si_get_sample_id(ctx);
-
-       if (ctx->shader->key.mono.u.ps.fbfetch_msaa &&
-           !(ctx->screen->debug_flags & DBG(NO_FMASK))) {
-               fmask = ac_build_load_to_sgpr(&ctx->ac, ptr,
-                       LLVMConstInt(ctx->ac.i32, SI_PS_IMAGE_COLORBUF0_FMASK / 2, 0));
-
-               ac_apply_fmask_to_sample(&ctx->ac, fmask, args.coords,
-                                        ctx->shader->key.mono.u.ps.fbfetch_layered);
-       }
-
-       args.opcode = ac_image_load;
-       args.resource = image;
-       args.dmask = 0xf;
-       args.attributes = AC_FUNC_ATTR_READNONE;
-
-       if (ctx->shader->key.mono.u.ps.fbfetch_msaa)
-               args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ?
-                       ac_image_2darraymsaa : ac_image_2dmsaa;
-       else if (ctx->shader->key.mono.u.ps.fbfetch_is_1D)
-               args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ?
-                       ac_image_1darray : ac_image_1d;
-       else
-               args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ?
-                       ac_image_2darray : ac_image_2d;
-
-       return ac_build_image_opcode(&ctx->ac, &args);
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct ac_image_args args = {};
+   LLVMValueRef ptr, image, fmask;
+
+   /* Ignore src0, because KHR_blend_func_extended disallows multiple render
+    * targets.
+    */
+
+   /* Load the image descriptor. */
+   STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0 % 2 == 0);
+   ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+   ptr =
+      LLVMBuildPointerCast(ctx->ac.builder, ptr, ac_array_in_const32_addr_space(ctx->ac.v8i32), "");
+   image =
+      ac_build_load_to_sgpr(&ctx->ac, ptr, LLVMConstInt(ctx->ac.i32, SI_PS_IMAGE_COLORBUF0 / 2, 0));
+
+   unsigned chan = 0;
+
+   args.coords[chan++] = si_unpack_param(ctx, ctx->pos_fixed_pt, 0, 16);
+
+   if (!ctx->shader->key.mono.u.ps.fbfetch_is_1D)
+      args.coords[chan++] = si_unpack_param(ctx, ctx->pos_fixed_pt, 16, 16);
+
+   /* Get the current render target layer index. */
+   if (ctx->shader->key.mono.u.ps.fbfetch_layered)
+      args.coords[chan++] = si_unpack_param(ctx, ctx->args.ancillary, 16, 11);
+
+   if (ctx->shader->key.mono.u.ps.fbfetch_msaa)
+      args.coords[chan++] = si_get_sample_id(ctx);
+
+   if (ctx->shader->key.mono.u.ps.fbfetch_msaa && !(ctx->screen->debug_flags & DBG(NO_FMASK))) {
+      fmask = ac_build_load_to_sgpr(&ctx->ac, ptr,
+                                    LLVMConstInt(ctx->ac.i32, SI_PS_IMAGE_COLORBUF0_FMASK / 2, 0));
+
+      ac_apply_fmask_to_sample(&ctx->ac, fmask, args.coords,
+                               ctx->shader->key.mono.u.ps.fbfetch_layered);
+   }
+
+   args.opcode = ac_image_load;
+   args.resource = image;
+   args.dmask = 0xf;
+   args.attributes = AC_FUNC_ATTR_READNONE;
+
+   if (ctx->shader->key.mono.u.ps.fbfetch_msaa)
+      args.dim =
+         ctx->shader->key.mono.u.ps.fbfetch_layered ? ac_image_2darraymsaa : ac_image_2dmsaa;
+   else if (ctx->shader->key.mono.u.ps.fbfetch_is_1D)
+      args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ? ac_image_1darray : ac_image_1d;
+   else
+      args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ? ac_image_2darray : ac_image_2d;
+
+   return ac_build_image_opcode(&ctx->ac, &args);
 }
 
-static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx,
-                                      unsigned attr_index, unsigned chan,
-                                      LLVMValueRef prim_mask,
-                                      LLVMValueRef i, LLVMValueRef j)
+static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx, unsigned attr_index,
+                                       unsigned chan, LLVMValueRef prim_mask, LLVMValueRef i,
+                                       LLVMValueRef j)
 {
-       if (i || j) {
-               return ac_build_fs_interp(&ctx->ac,
-                                         LLVMConstInt(ctx->ac.i32, chan, 0),
-                                         LLVMConstInt(ctx->ac.i32, attr_index, 0),
-                                         prim_mask, i, j);
-       }
-       return ac_build_fs_interp_mov(&ctx->ac,
-                                     LLVMConstInt(ctx->ac.i32, 2, 0), /* P0 */
-                                     LLVMConstInt(ctx->ac.i32, chan, 0),
-                                     LLVMConstInt(ctx->ac.i32, attr_index, 0),
-                                     prim_mask);
+   if (i || j) {
+      return ac_build_fs_interp(&ctx->ac, LLVMConstInt(ctx->ac.i32, chan, 0),
+                                LLVMConstInt(ctx->ac.i32, attr_index, 0), prim_mask, i, j);
+   }
+   return ac_build_fs_interp_mov(&ctx->ac, LLVMConstInt(ctx->ac.i32, 2, 0), /* P0 */
+                                 LLVMConstInt(ctx->ac.i32, chan, 0),
+                                 LLVMConstInt(ctx->ac.i32, attr_index, 0), prim_mask);
 }
 
 /**
@@ -149,345 +140,300 @@ static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx,
  * @param face                 SI_PARAM_FRONT_FACE
  * @param result               the return value (4 components)
  */
-static void interp_fs_color(struct si_shader_context *ctx,
-                           unsigned input_index,
-                           unsigned semantic_index,
-                           unsigned num_interp_inputs,
-                           unsigned colors_read_mask,
-                           LLVMValueRef interp_param,
-                           LLVMValueRef prim_mask,
-                           LLVMValueRef face,
-                           LLVMValueRef result[4])
+static void interp_fs_color(struct si_shader_context *ctx, unsigned input_index,
+                            unsigned semantic_index, unsigned num_interp_inputs,
+                            unsigned colors_read_mask, LLVMValueRef interp_param,
+                            LLVMValueRef prim_mask, LLVMValueRef face, LLVMValueRef result[4])
 {
-       LLVMValueRef i = NULL, j = NULL;
-       unsigned chan;
-
-       /* fs.constant returns the param from the middle vertex, so it's not
-        * really useful for flat shading. It's meant to be used for custom
-        * interpolation (but the intrinsic can't fetch from the other two
-        * vertices).
-        *
-        * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
-        * to do the right thing. The only reason we use fs.constant is that
-        * fs.interp cannot be used on integers, because they can be equal
-        * to NaN.
-        *
-        * When interp is false we will use fs.constant or for newer llvm,
-         * amdgcn.interp.mov.
-        */
-       bool interp = interp_param != NULL;
-
-       if (interp) {
-               interp_param = LLVMBuildBitCast(ctx->ac.builder, interp_param,
-                                               LLVMVectorType(ctx->ac.f32, 2), "");
-
-               i = LLVMBuildExtractElement(ctx->ac.builder, interp_param,
-                                               ctx->ac.i32_0, "");
-               j = LLVMBuildExtractElement(ctx->ac.builder, interp_param,
-                                               ctx->ac.i32_1, "");
-       }
-
-       if (ctx->shader->key.part.ps.prolog.color_two_side) {
-               LLVMValueRef is_face_positive;
-
-               /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
-                * otherwise it's at offset "num_inputs".
-                */
-               unsigned back_attr_offset = num_interp_inputs;
-               if (semantic_index == 1 && colors_read_mask & 0xf)
-                       back_attr_offset += 1;
-
-               is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
-                                                face, ctx->ac.i32_0, "");
-
-               for (chan = 0; chan < 4; chan++) {
-                       LLVMValueRef front, back;
-
-                       front = si_build_fs_interp(ctx,
-                                                  input_index, chan,
-                                                  prim_mask, i, j);
-                       back = si_build_fs_interp(ctx,
-                                                 back_attr_offset, chan,
-                                                 prim_mask, i, j);
-
-                       result[chan] = LLVMBuildSelect(ctx->ac.builder,
-                                               is_face_positive,
-                                               front,
-                                               back,
-                                               "");
-               }
-       } else {
-               for (chan = 0; chan < 4; chan++) {
-                       result[chan] = si_build_fs_interp(ctx,
-                                                         input_index, chan,
-                                                         prim_mask, i, j);
-               }
-       }
+   LLVMValueRef i = NULL, j = NULL;
+   unsigned chan;
+
+   /* fs.constant returns the param from the middle vertex, so it's not
+    * really useful for flat shading. It's meant to be used for custom
+    * interpolation (but the intrinsic can't fetch from the other two
+    * vertices).
+    *
+    * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
+    * to do the right thing. The only reason we use fs.constant is that
+    * fs.interp cannot be used on integers, because they can be equal
+    * to NaN.
+    *
+    * When interp is false we will use fs.constant or for newer llvm,
+    * amdgcn.interp.mov.
+    */
+   bool interp = interp_param != NULL;
+
+   if (interp) {
+      interp_param =
+         LLVMBuildBitCast(ctx->ac.builder, interp_param, LLVMVectorType(ctx->ac.f32, 2), "");
+
+      i = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ctx->ac.i32_0, "");
+      j = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ctx->ac.i32_1, "");
+   }
+
+   if (ctx->shader->key.part.ps.prolog.color_two_side) {
+      LLVMValueRef is_face_positive;
+
+      /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
+       * otherwise it's at offset "num_inputs".
+       */
+      unsigned back_attr_offset = num_interp_inputs;
+      if (semantic_index == 1 && colors_read_mask & 0xf)
+         back_attr_offset += 1;
+
+      is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, face, ctx->ac.i32_0, "");
+
+      for (chan = 0; chan < 4; chan++) {
+         LLVMValueRef front, back;
+
+         front = si_build_fs_interp(ctx, input_index, chan, prim_mask, i, j);
+         back = si_build_fs_interp(ctx, back_attr_offset, chan, prim_mask, i, j);
+
+         result[chan] = LLVMBuildSelect(ctx->ac.builder, is_face_positive, front, back, "");
+      }
+   } else {
+      for (chan = 0; chan < 4; chan++) {
+         result[chan] = si_build_fs_interp(ctx, input_index, chan, prim_mask, i, j);
+      }
+   }
 }
 
 static void si_alpha_test(struct si_shader_context *ctx, LLVMValueRef alpha)
 {
-       if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
-               static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = {
-                       [PIPE_FUNC_LESS] = LLVMRealOLT,
-                       [PIPE_FUNC_EQUAL] = LLVMRealOEQ,
-                       [PIPE_FUNC_LEQUAL] = LLVMRealOLE,
-                       [PIPE_FUNC_GREATER] = LLVMRealOGT,
-                       [PIPE_FUNC_NOTEQUAL] = LLVMRealONE,
-                       [PIPE_FUNC_GEQUAL] = LLVMRealOGE,
-               };
-               LLVMRealPredicate cond = cond_map[ctx->shader->key.part.ps.epilog.alpha_func];
-               assert(cond);
-
-               LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
-                               SI_PARAM_ALPHA_REF);
-               LLVMValueRef alpha_pass =
-                       LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, "");
-               ac_build_kill_if_false(&ctx->ac, alpha_pass);
-       } else {
-               ac_build_kill_if_false(&ctx->ac, ctx->ac.i1false);
-       }
+   if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
+      static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = {
+         [PIPE_FUNC_LESS] = LLVMRealOLT,     [PIPE_FUNC_EQUAL] = LLVMRealOEQ,
+         [PIPE_FUNC_LEQUAL] = LLVMRealOLE,   [PIPE_FUNC_GREATER] = LLVMRealOGT,
+         [PIPE_FUNC_NOTEQUAL] = LLVMRealONE, [PIPE_FUNC_GEQUAL] = LLVMRealOGE,
+      };
+      LLVMRealPredicate cond = cond_map[ctx->shader->key.part.ps.epilog.alpha_func];
+      assert(cond);
+
+      LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn, SI_PARAM_ALPHA_REF);
+      LLVMValueRef alpha_pass = LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, "");
+      ac_build_kill_if_false(&ctx->ac, alpha_pass);
+   } else {
+      ac_build_kill_if_false(&ctx->ac, ctx->ac.i1false);
+   }
 }
 
-static LLVMValueRef si_scale_alpha_by_sample_mask(struct si_shader_context *ctx,
-                                                 LLVMValueRef alpha,
-                                                 unsigned samplemask_param)
+static LLVMValueRef si_scale_alpha_by_sample_mask(struct si_shader_context *ctx, LLVMValueRef alpha,
+                                                  unsigned samplemask_param)
 {
-       LLVMValueRef coverage;
+   LLVMValueRef coverage;
 
-       /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
-       coverage = LLVMGetParam(ctx->main_fn,
-                               samplemask_param);
-       coverage = ac_to_integer(&ctx->ac, coverage);
+   /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
+   coverage = LLVMGetParam(ctx->main_fn, samplemask_param);
+   coverage = ac_to_integer(&ctx->ac, coverage);
 
-       coverage = ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i32",
-                                  ctx->ac.i32,
-                                  &coverage, 1, AC_FUNC_ATTR_READNONE);
+   coverage = ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i32", ctx->ac.i32, &coverage, 1,
+                                 AC_FUNC_ATTR_READNONE);
 
-       coverage = LLVMBuildUIToFP(ctx->ac.builder, coverage,
-                                  ctx->ac.f32, "");
+   coverage = LLVMBuildUIToFP(ctx->ac.builder, coverage, ctx->ac.f32, "");
 
-       coverage = LLVMBuildFMul(ctx->ac.builder, coverage,
-                                LLVMConstReal(ctx->ac.f32,
-                                       1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
+   coverage = LLVMBuildFMul(ctx->ac.builder, coverage,
+                            LLVMConstReal(ctx->ac.f32, 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
 
-       return LLVMBuildFMul(ctx->ac.builder, alpha, coverage, "");
+   return LLVMBuildFMul(ctx->ac.builder, alpha, coverage, "");
 }
 
 struct si_ps_exports {
-       unsigned num;
-       struct ac_export_args args[10];
+   unsigned num;
+   struct ac_export_args args[10];
 };
 
-static void si_export_mrt_z(struct si_shader_context *ctx,
-                           LLVMValueRef depth, LLVMValueRef stencil,
-                           LLVMValueRef samplemask, struct si_ps_exports *exp)
+static void si_export_mrt_z(struct si_shader_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
+                            LLVMValueRef samplemask, struct si_ps_exports *exp)
 {
-       struct ac_export_args args;
+   struct ac_export_args args;
 
-       ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args);
+   ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args);
 
-       memcpy(&exp->args[exp->num++], &args, sizeof(args));
+   memcpy(&exp->args[exp->num++], &args, sizeof(args));
 }
 
 /* Initialize arguments for the shader export intrinsic */
-static void si_llvm_init_ps_export_args(struct si_shader_context *ctx,
-                                       LLVMValueRef *values,
-                                       unsigned target,
-                                       struct ac_export_args *args)
+static void si_llvm_init_ps_export_args(struct si_shader_context *ctx, LLVMValueRef *values,
+                                        unsigned target, struct ac_export_args *args)
 {
-       const struct si_shader_key *key = &ctx->shader->key;
-       unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
-       LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32);
-       unsigned spi_shader_col_format;
-       unsigned chan;
-       bool is_int8, is_int10;
-       int cbuf = target - V_008DFC_SQ_EXP_MRT;
-
-       assert(cbuf >= 0 && cbuf < 8);
-
-       spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
-       is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
-       is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
-
-       /* Default is 0xf. Adjusted below depending on the format. */
-       args->enabled_channels = 0xf; /* writemask */
-
-       /* Specify whether the EXEC mask represents the valid mask */
-       args->valid_mask = 0;
-
-       /* Specify whether this is the last export */
-       args->done = 0;
-
-       /* Specify the target we are exporting */
-       args->target = target;
-
-       args->compr = false;
-       args->out[0] = f32undef;
-       args->out[1] = f32undef;
-       args->out[2] = f32undef;
-       args->out[3] = f32undef;
-
-       LLVMValueRef (*packf)(struct ac_llvm_context *ctx, LLVMValueRef args[2]) = NULL;
-       LLVMValueRef (*packi)(struct ac_llvm_context *ctx, LLVMValueRef args[2],
-                             unsigned bits, bool hi) = NULL;
-
-       switch (spi_shader_col_format) {
-       case V_028714_SPI_SHADER_ZERO:
-               args->enabled_channels = 0; /* writemask */
-               args->target = V_008DFC_SQ_EXP_NULL;
-               break;
-
-       case V_028714_SPI_SHADER_32_R:
-               args->enabled_channels = 1; /* writemask */
-               args->out[0] = values[0];
-               break;
-
-       case V_028714_SPI_SHADER_32_GR:
-               args->enabled_channels = 0x3; /* writemask */
-               args->out[0] = values[0];
-               args->out[1] = values[1];
-               break;
-
-       case V_028714_SPI_SHADER_32_AR:
-               if (ctx->screen->info.chip_class >= GFX10) {
-                       args->enabled_channels = 0x3; /* writemask */
-                       args->out[0] = values[0];
-                       args->out[1] = values[3];
-               } else {
-                       args->enabled_channels = 0x9; /* writemask */
-                       args->out[0] = values[0];
-                       args->out[3] = values[3];
-               }
-               break;
-
-       case V_028714_SPI_SHADER_FP16_ABGR:
-               packf = ac_build_cvt_pkrtz_f16;
-               break;
-
-       case V_028714_SPI_SHADER_UNORM16_ABGR:
-               packf = ac_build_cvt_pknorm_u16;
-               break;
-
-       case V_028714_SPI_SHADER_SNORM16_ABGR:
-               packf = ac_build_cvt_pknorm_i16;
-               break;
-
-       case V_028714_SPI_SHADER_UINT16_ABGR:
-               packi = ac_build_cvt_pk_u16;
-               break;
-
-       case V_028714_SPI_SHADER_SINT16_ABGR:
-               packi = ac_build_cvt_pk_i16;
-               break;
-
-       case V_028714_SPI_SHADER_32_ABGR:
-               memcpy(&args->out[0], values, sizeof(values[0]) * 4);
-               break;
-       }
-
-       /* Pack f16 or norm_i16/u16. */
-       if (packf) {
-               for (chan = 0; chan < 2; chan++) {
-                       LLVMValueRef pack_args[2] = {
-                               values[2 * chan],
-                               values[2 * chan + 1]
-                       };
-                       LLVMValueRef packed;
-
-                       packed = packf(&ctx->ac, pack_args);
-                       args->out[chan] = ac_to_float(&ctx->ac, packed);
-               }
-               args->compr = 1; /* COMPR flag */
-       }
-       /* Pack i16/u16. */
-       if (packi) {
-               for (chan = 0; chan < 2; chan++) {
-                       LLVMValueRef pack_args[2] = {
-                               ac_to_integer(&ctx->ac, values[2 * chan]),
-                               ac_to_integer(&ctx->ac, values[2 * chan + 1])
-                       };
-                       LLVMValueRef packed;
-
-                       packed = packi(&ctx->ac, pack_args,
-                                      is_int8 ? 8 : is_int10 ? 10 : 16,
-                                      chan == 1);
-                       args->out[chan] = ac_to_float(&ctx->ac, packed);
-               }
-               args->compr = 1; /* COMPR flag */
-       }
+   const struct si_shader_key *key = &ctx->shader->key;
+   unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
+   LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32);
+   unsigned spi_shader_col_format;
+   unsigned chan;
+   bool is_int8, is_int10;
+   int cbuf = target - V_008DFC_SQ_EXP_MRT;
+
+   assert(cbuf >= 0 && cbuf < 8);
+
+   spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
+   is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
+   is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
+
+   /* Default is 0xf. Adjusted below depending on the format. */
+   args->enabled_channels = 0xf; /* writemask */
+
+   /* Specify whether the EXEC mask represents the valid mask */
+   args->valid_mask = 0;
+
+   /* Specify whether this is the last export */
+   args->done = 0;
+
+   /* Specify the target we are exporting */
+   args->target = target;
+
+   args->compr = false;
+   args->out[0] = f32undef;
+   args->out[1] = f32undef;
+   args->out[2] = f32undef;
+   args->out[3] = f32undef;
+
+   LLVMValueRef (*packf)(struct ac_llvm_context * ctx, LLVMValueRef args[2]) = NULL;
+   LLVMValueRef (*packi)(struct ac_llvm_context * ctx, LLVMValueRef args[2], unsigned bits,
+                         bool hi) = NULL;
+
+   switch (spi_shader_col_format) {
+   case V_028714_SPI_SHADER_ZERO:
+      args->enabled_channels = 0; /* writemask */
+      args->target = V_008DFC_SQ_EXP_NULL;
+      break;
+
+   case V_028714_SPI_SHADER_32_R:
+      args->enabled_channels = 1; /* writemask */
+      args->out[0] = values[0];
+      break;
+
+   case V_028714_SPI_SHADER_32_GR:
+      args->enabled_channels = 0x3; /* writemask */
+      args->out[0] = values[0];
+      args->out[1] = values[1];
+      break;
+
+   case V_028714_SPI_SHADER_32_AR:
+      if (ctx->screen->info.chip_class >= GFX10) {
+         args->enabled_channels = 0x3; /* writemask */
+         args->out[0] = values[0];
+         args->out[1] = values[3];
+      } else {
+         args->enabled_channels = 0x9; /* writemask */
+         args->out[0] = values[0];
+         args->out[3] = values[3];
+      }
+      break;
+
+   case V_028714_SPI_SHADER_FP16_ABGR:
+      packf = ac_build_cvt_pkrtz_f16;
+      break;
+
+   case V_028714_SPI_SHADER_UNORM16_ABGR:
+      packf = ac_build_cvt_pknorm_u16;
+      break;
+
+   case V_028714_SPI_SHADER_SNORM16_ABGR:
+      packf = ac_build_cvt_pknorm_i16;
+      break;
+
+   case V_028714_SPI_SHADER_UINT16_ABGR:
+      packi = ac_build_cvt_pk_u16;
+      break;
+
+   case V_028714_SPI_SHADER_SINT16_ABGR:
+      packi = ac_build_cvt_pk_i16;
+      break;
+
+   case V_028714_SPI_SHADER_32_ABGR:
+      memcpy(&args->out[0], values, sizeof(values[0]) * 4);
+      break;
+   }
+
+   /* Pack f16 or norm_i16/u16. */
+   if (packf) {
+      for (chan = 0; chan < 2; chan++) {
+         LLVMValueRef pack_args[2] = {values[2 * chan], values[2 * chan + 1]};
+         LLVMValueRef packed;
+
+         packed = packf(&ctx->ac, pack_args);
+         args->out[chan] = ac_to_float(&ctx->ac, packed);
+      }
+      args->compr = 1; /* COMPR flag */
+   }
+   /* Pack i16/u16. */
+   if (packi) {
+      for (chan = 0; chan < 2; chan++) {
+         LLVMValueRef pack_args[2] = {ac_to_integer(&ctx->ac, values[2 * chan]),
+                                      ac_to_integer(&ctx->ac, values[2 * chan + 1])};
+         LLVMValueRef packed;
+
+         packed = packi(&ctx->ac, pack_args, is_int8 ? 8 : is_int10 ? 10 : 16, chan == 1);
+         args->out[chan] = ac_to_float(&ctx->ac, packed);
+      }
+      args->compr = 1; /* COMPR flag */
+   }
 }
 
-static void si_export_mrt_color(struct si_shader_context *ctx,
-                               LLVMValueRef *color, unsigned index,
-                               unsigned samplemask_param,
-                               bool is_last, struct si_ps_exports *exp)
+static void si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *color, unsigned index,
+                                unsigned samplemask_param, bool is_last, struct si_ps_exports *exp)
 {
-       int i;
-
-       /* Clamp color */
-       if (ctx->shader->key.part.ps.epilog.clamp_color)
-               for (i = 0; i < 4; i++)
-                       color[i] = ac_build_clamp(&ctx->ac, color[i]);
-
-       /* Alpha to one */
-       if (ctx->shader->key.part.ps.epilog.alpha_to_one)
-               color[3] = ctx->ac.f32_1;
-
-       /* Alpha test */
-       if (index == 0 &&
-           ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
-               si_alpha_test(ctx, color[3]);
-
-       /* Line & polygon smoothing */
-       if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
-               color[3] = si_scale_alpha_by_sample_mask(ctx, color[3],
-                                                        samplemask_param);
-
-       /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
-       if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
-               struct ac_export_args args[8];
-               int c, last = -1;
-
-               /* Get the export arguments, also find out what the last one is. */
-               for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
-                       si_llvm_init_ps_export_args(ctx, color,
-                                                   V_008DFC_SQ_EXP_MRT + c, &args[c]);
-                       if (args[c].enabled_channels)
-                               last = c;
-               }
-
-               /* Emit all exports. */
-               for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
-                       if (is_last && last == c) {
-                               args[c].valid_mask = 1; /* whether the EXEC mask is valid */
-                               args[c].done = 1; /* DONE bit */
-                       } else if (!args[c].enabled_channels)
-                               continue; /* unnecessary NULL export */
-
-                       memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
-               }
-       } else {
-               struct ac_export_args args;
-
-               /* Export */
-               si_llvm_init_ps_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + index,
-                                           &args);
-               if (is_last) {
-                       args.valid_mask = 1; /* whether the EXEC mask is valid */
-                       args.done = 1; /* DONE bit */
-               } else if (!args.enabled_channels)
-                       return; /* unnecessary NULL export */
-
-               memcpy(&exp->args[exp->num++], &args, sizeof(args));
-       }
+   int i;
+
+   /* Clamp color */
+   if (ctx->shader->key.part.ps.epilog.clamp_color)
+      for (i = 0; i < 4; i++)
+         color[i] = ac_build_clamp(&ctx->ac, color[i]);
+
+   /* Alpha to one */
+   if (ctx->shader->key.part.ps.epilog.alpha_to_one)
+      color[3] = ctx->ac.f32_1;
+
+   /* Alpha test */
+   if (index == 0 && ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
+      si_alpha_test(ctx, color[3]);
+
+   /* Line & polygon smoothing */
+   if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
+      color[3] = si_scale_alpha_by_sample_mask(ctx, color[3], samplemask_param);
+
+   /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
+   if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
+      struct ac_export_args args[8];
+      int c, last = -1;
+
+      /* Get the export arguments, also find out what the last one is. */
+      for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
+         si_llvm_init_ps_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + c, &args[c]);
+         if (args[c].enabled_channels)
+            last = c;
+      }
+
+      /* Emit all exports. */
+      for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
+         if (is_last && last == c) {
+            args[c].valid_mask = 1; /* whether the EXEC mask is valid */
+            args[c].done = 1;       /* DONE bit */
+         } else if (!args[c].enabled_channels)
+            continue; /* unnecessary NULL export */
+
+         memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
+      }
+   } else {
+      struct ac_export_args args;
+
+      /* Export */
+      si_llvm_init_ps_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + index, &args);
+      if (is_last) {
+         args.valid_mask = 1; /* whether the EXEC mask is valid */
+         args.done = 1;       /* DONE bit */
+      } else if (!args.enabled_channels)
+         return; /* unnecessary NULL export */
+
+      memcpy(&exp->args[exp->num++], &args, sizeof(args));
+   }
 }
 
-static void si_emit_ps_exports(struct si_shader_context *ctx,
-                              struct si_ps_exports *exp)
+static void si_emit_ps_exports(struct si_shader_context *ctx, struct si_ps_exports *exp)
 {
-       for (unsigned i = 0; i < exp->num; i++)
-               ac_build_export(&ctx->ac, &exp->args[i]);
+   for (unsigned i = 0; i < exp->num; i++)
+      ac_build_export(&ctx->ac, &exp->args[i]);
 }
 
 /**
@@ -503,117 +449,108 @@ static void si_emit_ps_exports(struct si_shader_context *ctx,
  *
  * The alpha-ref SGPR is returned via its original location.
  */
-static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi,
-                                     unsigned max_outputs,
-                                     LLVMValueRef *addrs)
+static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi, unsigned max_outputs,
+                                      LLVMValueRef *addrs)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct si_shader *shader = ctx->shader;
-       struct si_shader_info *info = &shader->selector->info;
-       LLVMBuilderRef builder = ctx->ac.builder;
-       unsigned i, j, first_vgpr, vgpr;
-
-       LLVMValueRef color[8][4] = {};
-       LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
-       LLVMValueRef ret;
-
-       if (ctx->postponed_kill)
-               ac_build_kill_if_false(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, ""));
-
-       /* Read the output values. */
-       for (i = 0; i < info->num_outputs; i++) {
-               unsigned semantic_name = info->output_semantic_name[i];
-               unsigned semantic_index = info->output_semantic_index[i];
-
-               switch (semantic_name) {
-               case TGSI_SEMANTIC_COLOR:
-                       assert(semantic_index < 8);
-                       for (j = 0; j < 4; j++) {
-                               LLVMValueRef ptr = addrs[4 * i + j];
-                               LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
-                               color[semantic_index][j] = result;
-                       }
-                       break;
-               case TGSI_SEMANTIC_POSITION:
-                       depth = LLVMBuildLoad(builder,
-                                             addrs[4 * i + 0], "");
-                       break;
-               case TGSI_SEMANTIC_STENCIL:
-                       stencil = LLVMBuildLoad(builder,
-                                               addrs[4 * i + 0], "");
-                       break;
-               case TGSI_SEMANTIC_SAMPLEMASK:
-                       samplemask = LLVMBuildLoad(builder,
-                                                  addrs[4 * i + 0], "");
-                       break;
-               default:
-                       fprintf(stderr, "Warning: GFX6 unhandled fs output type:%d\n",
-                               semantic_name);
-               }
-       }
-
-       /* Fill the return structure. */
-       ret = ctx->return_value;
-
-       /* Set SGPRs. */
-       ret = LLVMBuildInsertValue(builder, ret,
-                                  ac_to_integer(&ctx->ac,
-                                                 LLVMGetParam(ctx->main_fn,
-                                                              SI_PARAM_ALPHA_REF)),
-                                  SI_SGPR_ALPHA_REF, "");
-
-       /* Set VGPRs */
-       first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
-       for (i = 0; i < ARRAY_SIZE(color); i++) {
-               if (!color[i][0])
-                       continue;
-
-               for (j = 0; j < 4; j++)
-                       ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
-       }
-       if (depth)
-               ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
-       if (stencil)
-               ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
-       if (samplemask)
-               ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
-
-       /* Add the input sample mask for smoothing at the end. */
-       if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
-               vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
-       ret = LLVMBuildInsertValue(builder, ret,
-                                  LLVMGetParam(ctx->main_fn,
-                                               SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
-
-       ctx->return_value = ret;
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader *shader = ctx->shader;
+   struct si_shader_info *info = &shader->selector->info;
+   LLVMBuilderRef builder = ctx->ac.builder;
+   unsigned i, j, first_vgpr, vgpr;
+
+   LLVMValueRef color[8][4] = {};
+   LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
+   LLVMValueRef ret;
+
+   if (ctx->postponed_kill)
+      ac_build_kill_if_false(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, ""));
+
+   /* Read the output values. */
+   for (i = 0; i < info->num_outputs; i++) {
+      unsigned semantic_name = info->output_semantic_name[i];
+      unsigned semantic_index = info->output_semantic_index[i];
+
+      switch (semantic_name) {
+      case TGSI_SEMANTIC_COLOR:
+         assert(semantic_index < 8);
+         for (j = 0; j < 4; j++) {
+            LLVMValueRef ptr = addrs[4 * i + j];
+            LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
+            color[semantic_index][j] = result;
+         }
+         break;
+      case TGSI_SEMANTIC_POSITION:
+         depth = LLVMBuildLoad(builder, addrs[4 * i + 0], "");
+         break;
+      case TGSI_SEMANTIC_STENCIL:
+         stencil = LLVMBuildLoad(builder, addrs[4 * i + 0], "");
+         break;
+      case TGSI_SEMANTIC_SAMPLEMASK:
+         samplemask = LLVMBuildLoad(builder, addrs[4 * i + 0], "");
+         break;
+      default:
+         fprintf(stderr, "Warning: GFX6 unhandled fs output type:%d\n", semantic_name);
+      }
+   }
+
+   /* Fill the return structure. */
+   ret = ctx->return_value;
+
+   /* Set SGPRs. */
+   ret = LLVMBuildInsertValue(
+      builder, ret, ac_to_integer(&ctx->ac, LLVMGetParam(ctx->main_fn, SI_PARAM_ALPHA_REF)),
+      SI_SGPR_ALPHA_REF, "");
+
+   /* Set VGPRs */
+   first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
+   for (i = 0; i < ARRAY_SIZE(color); i++) {
+      if (!color[i][0])
+         continue;
+
+      for (j = 0; j < 4; j++)
+         ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
+   }
+   if (depth)
+      ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
+   if (stencil)
+      ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
+   if (samplemask)
+      ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
+
+   /* Add the input sample mask for smoothing at the end. */
+   if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
+      vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
+   ret = LLVMBuildInsertValue(builder, ret, LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE),
+                              vgpr++, "");
+
+   ctx->return_value = ret;
 }
 
 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
-                                        LLVMValueRef param_rw_buffers,
-                                        struct ac_arg param_pos_fixed_pt)
+                                         LLVMValueRef param_rw_buffers,
+                                         struct ac_arg param_pos_fixed_pt)
 {
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef slot, desc, offset, row, bit, address[2];
-
-       /* Use the fixed-point gl_FragCoord input.
-        * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
-        * per coordinate to get the repeating effect.
-        */
-       address[0] = si_unpack_param(ctx, param_pos_fixed_pt, 0, 5);
-       address[1] = si_unpack_param(ctx, param_pos_fixed_pt, 16, 5);
-
-       /* Load the buffer descriptor. */
-       slot = LLVMConstInt(ctx->ac.i32, SI_PS_CONST_POLY_STIPPLE, 0);
-       desc = ac_build_load_to_sgpr(&ctx->ac, param_rw_buffers, slot);
-
-       /* The stipple pattern is 32x32, each row has 32 bits. */
-       offset = LLVMBuildMul(builder, address[1],
-                             LLVMConstInt(ctx->ac.i32, 4, 0), "");
-       row = si_buffer_load_const(ctx, desc, offset);
-       row = ac_to_integer(&ctx->ac, row);
-       bit = LLVMBuildLShr(builder, row, address[0], "");
-       bit = LLVMBuildTrunc(builder, bit, ctx->ac.i1, "");
-       ac_build_kill_if_false(&ctx->ac, bit);
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMValueRef slot, desc, offset, row, bit, address[2];
+
+   /* Use the fixed-point gl_FragCoord input.
+    * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
+    * per coordinate to get the repeating effect.
+    */
+   address[0] = si_unpack_param(ctx, param_pos_fixed_pt, 0, 5);
+   address[1] = si_unpack_param(ctx, param_pos_fixed_pt, 16, 5);
+
+   /* Load the buffer descriptor. */
+   slot = LLVMConstInt(ctx->ac.i32, SI_PS_CONST_POLY_STIPPLE, 0);
+   desc = ac_build_load_to_sgpr(&ctx->ac, param_rw_buffers, slot);
+
+   /* The stipple pattern is 32x32, each row has 32 bits. */
+   offset = LLVMBuildMul(builder, address[1], LLVMConstInt(ctx->ac.i32, 4, 0), "");
+   row = si_buffer_load_const(ctx, desc, offset);
+   row = ac_to_integer(&ctx->ac, row);
+   bit = LLVMBuildLShr(builder, row, address[0], "");
+   bit = LLVMBuildTrunc(builder, bit, ctx->ac.i1, "");
+   ac_build_kill_if_false(&ctx->ac, bit);
 }
 
 /**
@@ -626,416 +563,372 @@ static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
  * overriden by other states. (e.g. per-sample interpolation)
  * Interpolated colors are stored after the preloaded VGPRs.
  */
-void si_llvm_build_ps_prolog(struct si_shader_context *ctx,
-                            union si_shader_part_key *key)
+void si_llvm_build_ps_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
 {
-       LLVMValueRef ret, func;
-       int num_returns, i, num_color_channels;
-
-       memset(&ctx->args, 0, sizeof(ctx->args));
-
-       /* Declare inputs. */
-       LLVMTypeRef return_types[AC_MAX_ARGS];
-       num_returns = 0;
-       num_color_channels = util_bitcount(key->ps_prolog.colors_read);
-       assert(key->ps_prolog.num_input_sgprs +
-              key->ps_prolog.num_input_vgprs +
-              num_color_channels <= AC_MAX_ARGS);
-       for (i = 0; i < key->ps_prolog.num_input_sgprs; i++) {
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               return_types[num_returns++] = ctx->ac.i32;
-
-       }
-
-       struct ac_arg pos_fixed_pt;
-       struct ac_arg ancillary;
-       struct ac_arg param_sample_mask;
-       for (i = 0; i < key->ps_prolog.num_input_vgprs; i++) {
-               struct ac_arg *arg = NULL;
-               if (i == key->ps_prolog.ancillary_vgpr_index) {
-                       arg = &ancillary;
-               } else if (i == key->ps_prolog.ancillary_vgpr_index + 1) {
-                       arg = &param_sample_mask;
-               } else if (i == key->ps_prolog.num_input_vgprs - 1) {
-                       /* POS_FIXED_PT is always last. */
-                       arg = &pos_fixed_pt;
-               }
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, arg);
-               return_types[num_returns++] = ctx->ac.f32;
-       }
-
-       /* Declare outputs (same as inputs + add colors if needed) */
-       for (i = 0; i < num_color_channels; i++)
-               return_types[num_returns++] = ctx->ac.f32;
-
-       /* Create the function. */
-       si_llvm_create_func(ctx, "ps_prolog", return_types, num_returns, 0);
-       func = ctx->main_fn;
-
-       /* Copy inputs to outputs. This should be no-op, as the registers match,
-        * but it will prevent the compiler from overwriting them unintentionally.
-        */
-       ret = ctx->return_value;
-       for (i = 0; i < ctx->args.arg_count; i++) {
-               LLVMValueRef p = LLVMGetParam(func, i);
-               ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
-       }
-
-       /* Polygon stippling. */
-       if (key->ps_prolog.states.poly_stipple) {
-               LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
-
-               si_llvm_emit_polygon_stipple(ctx, list, pos_fixed_pt);
-       }
-
-       if (key->ps_prolog.states.bc_optimize_for_persp ||
-           key->ps_prolog.states.bc_optimize_for_linear) {
-               unsigned i, base = key->ps_prolog.num_input_sgprs;
-               LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
-
-               /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
-                * The hw doesn't compute CENTROID if the whole wave only
-                * contains fully-covered quads.
-                *
-                * PRIM_MASK is after user SGPRs.
-                */
-               bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
-               bc_optimize = LLVMBuildLShr(ctx->ac.builder, bc_optimize,
-                                           LLVMConstInt(ctx->ac.i32, 31, 0), "");
-               bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize,
-                                            ctx->ac.i1, "");
-
-               if (key->ps_prolog.states.bc_optimize_for_persp) {
-                       /* Read PERSP_CENTER. */
-                       for (i = 0; i < 2; i++)
-                               center[i] = LLVMGetParam(func, base + 2 + i);
-                       /* Read PERSP_CENTROID. */
-                       for (i = 0; i < 2; i++)
-                               centroid[i] = LLVMGetParam(func, base + 4 + i);
-                       /* Select PERSP_CENTROID. */
-                       for (i = 0; i < 2; i++) {
-                               tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize,
-                                                     center[i], centroid[i], "");
-                               ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
-                                                          tmp, base + 4 + i, "");
-                       }
-               }
-               if (key->ps_prolog.states.bc_optimize_for_linear) {
-                       /* Read LINEAR_CENTER. */
-                       for (i = 0; i < 2; i++)
-                               center[i] = LLVMGetParam(func, base + 8 + i);
-                       /* Read LINEAR_CENTROID. */
-                       for (i = 0; i < 2; i++)
-                               centroid[i] = LLVMGetParam(func, base + 10 + i);
-                       /* Select LINEAR_CENTROID. */
-                       for (i = 0; i < 2; i++) {
-                               tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize,
-                                                     center[i], centroid[i], "");
-                               ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
-                                                          tmp, base + 10 + i, "");
-                       }
-               }
-       }
-
-       /* Force per-sample interpolation. */
-       if (key->ps_prolog.states.force_persp_sample_interp) {
-               unsigned i, base = key->ps_prolog.num_input_sgprs;
-               LLVMValueRef persp_sample[2];
-
-               /* Read PERSP_SAMPLE. */
-               for (i = 0; i < 2; i++)
-                       persp_sample[i] = LLVMGetParam(func, base + i);
-               /* Overwrite PERSP_CENTER. */
-               for (i = 0; i < 2; i++)
-                       ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
-                                                  persp_sample[i], base + 2 + i, "");
-               /* Overwrite PERSP_CENTROID. */
-               for (i = 0; i < 2; i++)
-                       ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
-                                                  persp_sample[i], base + 4 + i, "");
-       }
-       if (key->ps_prolog.states.force_linear_sample_interp) {
-               unsigned i, base = key->ps_prolog.num_input_sgprs;
-               LLVMValueRef linear_sample[2];
-
-               /* Read LINEAR_SAMPLE. */
-               for (i = 0; i < 2; i++)
-                       linear_sample[i] = LLVMGetParam(func, base + 6 + i);
-               /* Overwrite LINEAR_CENTER. */
-               for (i = 0; i < 2; i++)
-                       ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
-                                                  linear_sample[i], base + 8 + i, "");
-               /* Overwrite LINEAR_CENTROID. */
-               for (i = 0; i < 2; i++)
-                       ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
-                                                  linear_sample[i], base + 10 + i, "");
-       }
-
-       /* Force center interpolation. */
-       if (key->ps_prolog.states.force_persp_center_interp) {
-               unsigned i, base = key->ps_prolog.num_input_sgprs;
-               LLVMValueRef persp_center[2];
-
-               /* Read PERSP_CENTER. */
-               for (i = 0; i < 2; i++)
-                       persp_center[i] = LLVMGetParam(func, base + 2 + i);
-               /* Overwrite PERSP_SAMPLE. */
-               for (i = 0; i < 2; i++)
-                       ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
-                                                  persp_center[i], base + i, "");
-               /* Overwrite PERSP_CENTROID. */
-               for (i = 0; i < 2; i++)
-                       ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
-                                                  persp_center[i], base + 4 + i, "");
-       }
-       if (key->ps_prolog.states.force_linear_center_interp) {
-               unsigned i, base = key->ps_prolog.num_input_sgprs;
-               LLVMValueRef linear_center[2];
-
-               /* Read LINEAR_CENTER. */
-               for (i = 0; i < 2; i++)
-                       linear_center[i] = LLVMGetParam(func, base + 8 + i);
-               /* Overwrite LINEAR_SAMPLE. */
-               for (i = 0; i < 2; i++)
-                       ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
-                                                  linear_center[i], base + 6 + i, "");
-               /* Overwrite LINEAR_CENTROID. */
-               for (i = 0; i < 2; i++)
-                       ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
-                                                  linear_center[i], base + 10 + i, "");
-       }
-
-       /* Interpolate colors. */
-       unsigned color_out_idx = 0;
-       for (i = 0; i < 2; i++) {
-               unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
-               unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
-                                    key->ps_prolog.face_vgpr_index;
-               LLVMValueRef interp[2], color[4];
-               LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
-
-               if (!writemask)
-                       continue;
-
-               /* If the interpolation qualifier is not CONSTANT (-1). */
-               if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
-                       unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
-                                              key->ps_prolog.color_interp_vgpr_index[i];
-
-                       /* Get the (i,j) updated by bc_optimize handling. */
-                       interp[0] = LLVMBuildExtractValue(ctx->ac.builder, ret,
-                                                         interp_vgpr, "");
-                       interp[1] = LLVMBuildExtractValue(ctx->ac.builder, ret,
-                                                         interp_vgpr + 1, "");
-                       interp_ij = ac_build_gather_values(&ctx->ac, interp, 2);
-               }
-
-               /* Use the absolute location of the input. */
-               prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
-
-               if (key->ps_prolog.states.color_two_side) {
-                       face = LLVMGetParam(func, face_vgpr);
-                       face = ac_to_integer(&ctx->ac, face);
-               }
-
-               interp_fs_color(ctx,
-                               key->ps_prolog.color_attr_index[i], i,
-                               key->ps_prolog.num_interp_inputs,
-                               key->ps_prolog.colors_read, interp_ij,
-                               prim_mask, face, color);
-
-               while (writemask) {
-                       unsigned chan = u_bit_scan(&writemask);
-                       ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan],
-                                                  ctx->args.arg_count + color_out_idx++, "");
-               }
-       }
-
-       /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
-        * says:
-        *
-        *    "When per-sample shading is active due to the use of a fragment
-        *     input qualified by sample or due to the use of the gl_SampleID
-        *     or gl_SamplePosition variables, only the bit for the current
-        *     sample is set in gl_SampleMaskIn. When state specifies multiple
-        *     fragment shader invocations for a given fragment, the sample
-        *     mask for any single fragment shader invocation may specify a
-        *     subset of the covered samples for the fragment. In this case,
-        *     the bit corresponding to each covered sample will be set in
-        *     exactly one fragment shader invocation."
-        *
-        * The samplemask loaded by hardware is always the coverage of the
-        * entire pixel/fragment, so mask bits out based on the sample ID.
-        */
-       if (key->ps_prolog.states.samplemask_log_ps_iter) {
-               /* The bit pattern matches that used by fixed function fragment
-                * processing. */
-               static const uint16_t ps_iter_masks[] = {
-                       0xffff, /* not used */
-                       0x5555,
-                       0x1111,
-                       0x0101,
-                       0x0001,
-               };
-               assert(key->ps_prolog.states.samplemask_log_ps_iter < ARRAY_SIZE(ps_iter_masks));
-
-               uint32_t ps_iter_mask = ps_iter_masks[key->ps_prolog.states.samplemask_log_ps_iter];
-               LLVMValueRef sampleid = si_unpack_param(ctx, ancillary, 8, 4);
-               LLVMValueRef samplemask = ac_get_arg(&ctx->ac, param_sample_mask);
-
-               samplemask = ac_to_integer(&ctx->ac, samplemask);
-               samplemask = LLVMBuildAnd(
-                       ctx->ac.builder,
-                       samplemask,
-                       LLVMBuildShl(ctx->ac.builder,
-                                    LLVMConstInt(ctx->ac.i32, ps_iter_mask, false),
-                                    sampleid, ""),
-                       "");
-               samplemask = ac_to_float(&ctx->ac, samplemask);
-
-               ret = LLVMBuildInsertValue(ctx->ac.builder, ret, samplemask,
-                                          param_sample_mask.arg_index, "");
-       }
-
-       /* Tell LLVM to insert WQM instruction sequence when needed. */
-       if (key->ps_prolog.wqm) {
-               LLVMAddTargetDependentFunctionAttr(func,
-                                                  "amdgpu-ps-wqm-outputs", "");
-       }
-
-       si_llvm_build_ret(ctx, ret);
+   LLVMValueRef ret, func;
+   int num_returns, i, num_color_channels;
+
+   memset(&ctx->args, 0, sizeof(ctx->args));
+
+   /* Declare inputs. */
+   LLVMTypeRef return_types[AC_MAX_ARGS];
+   num_returns = 0;
+   num_color_channels = util_bitcount(key->ps_prolog.colors_read);
+   assert(key->ps_prolog.num_input_sgprs + key->ps_prolog.num_input_vgprs + num_color_channels <=
+          AC_MAX_ARGS);
+   for (i = 0; i < key->ps_prolog.num_input_sgprs; i++) {
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      return_types[num_returns++] = ctx->ac.i32;
+   }
+
+   struct ac_arg pos_fixed_pt;
+   struct ac_arg ancillary;
+   struct ac_arg param_sample_mask;
+   for (i = 0; i < key->ps_prolog.num_input_vgprs; i++) {
+      struct ac_arg *arg = NULL;
+      if (i == key->ps_prolog.ancillary_vgpr_index) {
+         arg = &ancillary;
+      } else if (i == key->ps_prolog.ancillary_vgpr_index + 1) {
+         arg = &param_sample_mask;
+      } else if (i == key->ps_prolog.num_input_vgprs - 1) {
+         /* POS_FIXED_PT is always last. */
+         arg = &pos_fixed_pt;
+      }
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, arg);
+      return_types[num_returns++] = ctx->ac.f32;
+   }
+
+   /* Declare outputs (same as inputs + add colors if needed) */
+   for (i = 0; i < num_color_channels; i++)
+      return_types[num_returns++] = ctx->ac.f32;
+
+   /* Create the function. */
+   si_llvm_create_func(ctx, "ps_prolog", return_types, num_returns, 0);
+   func = ctx->main_fn;
+
+   /* Copy inputs to outputs. This should be a no-op, as the registers match,
+    * but it will prevent the compiler from overwriting them unintentionally.
+    */
+   ret = ctx->return_value;
+   for (i = 0; i < ctx->args.arg_count; i++) {
+      LLVMValueRef p = LLVMGetParam(func, i);
+      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
+   }
+
+   /* Polygon stippling. */
+   if (key->ps_prolog.states.poly_stipple) {
+      LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
+
+      si_llvm_emit_polygon_stipple(ctx, list, pos_fixed_pt);
+   }
+
+   if (key->ps_prolog.states.bc_optimize_for_persp ||
+       key->ps_prolog.states.bc_optimize_for_linear) {
+      unsigned i, base = key->ps_prolog.num_input_sgprs;
+      LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
+
+      /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
+       * The hw doesn't compute CENTROID if the whole wave only
+       * contains fully-covered quads.
+       *
+       * PRIM_MASK is after user SGPRs.
+       */
+      bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
+      bc_optimize =
+         LLVMBuildLShr(ctx->ac.builder, bc_optimize, LLVMConstInt(ctx->ac.i32, 31, 0), "");
+      bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize, ctx->ac.i1, "");
+
+      if (key->ps_prolog.states.bc_optimize_for_persp) {
+         /* Read PERSP_CENTER. */
+         for (i = 0; i < 2; i++)
+            center[i] = LLVMGetParam(func, base + 2 + i);
+         /* Read PERSP_CENTROID. */
+         for (i = 0; i < 2; i++)
+            centroid[i] = LLVMGetParam(func, base + 4 + i);
+         /* Select PERSP_CENTROID. */
+         for (i = 0; i < 2; i++) {
+            tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, center[i], centroid[i], "");
+            ret = LLVMBuildInsertValue(ctx->ac.builder, ret, tmp, base + 4 + i, "");
+         }
+      }
+      if (key->ps_prolog.states.bc_optimize_for_linear) {
+         /* Read LINEAR_CENTER. */
+         for (i = 0; i < 2; i++)
+            center[i] = LLVMGetParam(func, base + 8 + i);
+         /* Read LINEAR_CENTROID. */
+         for (i = 0; i < 2; i++)
+            centroid[i] = LLVMGetParam(func, base + 10 + i);
+         /* Select LINEAR_CENTROID. */
+         for (i = 0; i < 2; i++) {
+            tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, center[i], centroid[i], "");
+            ret = LLVMBuildInsertValue(ctx->ac.builder, ret, tmp, base + 10 + i, "");
+         }
+      }
+   }
+
+   /* Force per-sample interpolation. */
+   if (key->ps_prolog.states.force_persp_sample_interp) {
+      unsigned i, base = key->ps_prolog.num_input_sgprs;
+      LLVMValueRef persp_sample[2];
+
+      /* Read PERSP_SAMPLE. */
+      for (i = 0; i < 2; i++)
+         persp_sample[i] = LLVMGetParam(func, base + i);
+      /* Overwrite PERSP_CENTER. */
+      for (i = 0; i < 2; i++)
+         ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_sample[i], base + 2 + i, "");
+      /* Overwrite PERSP_CENTROID. */
+      for (i = 0; i < 2; i++)
+         ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_sample[i], base + 4 + i, "");
+   }
+   if (key->ps_prolog.states.force_linear_sample_interp) {
+      unsigned i, base = key->ps_prolog.num_input_sgprs;
+      LLVMValueRef linear_sample[2];
+
+      /* Read LINEAR_SAMPLE. */
+      for (i = 0; i < 2; i++)
+         linear_sample[i] = LLVMGetParam(func, base + 6 + i);
+      /* Overwrite LINEAR_CENTER. */
+      for (i = 0; i < 2; i++)
+         ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_sample[i], base + 8 + i, "");
+      /* Overwrite LINEAR_CENTROID. */
+      for (i = 0; i < 2; i++)
+         ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_sample[i], base + 10 + i, "");
+   }
+
+   /* Force center interpolation. */
+   if (key->ps_prolog.states.force_persp_center_interp) {
+      unsigned i, base = key->ps_prolog.num_input_sgprs;
+      LLVMValueRef persp_center[2];
+
+      /* Read PERSP_CENTER. */
+      for (i = 0; i < 2; i++)
+         persp_center[i] = LLVMGetParam(func, base + 2 + i);
+      /* Overwrite PERSP_SAMPLE. */
+      for (i = 0; i < 2; i++)
+         ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_center[i], base + i, "");
+      /* Overwrite PERSP_CENTROID. */
+      for (i = 0; i < 2; i++)
+         ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_center[i], base + 4 + i, "");
+   }
+   if (key->ps_prolog.states.force_linear_center_interp) {
+      unsigned i, base = key->ps_prolog.num_input_sgprs;
+      LLVMValueRef linear_center[2];
+
+      /* Read LINEAR_CENTER. */
+      for (i = 0; i < 2; i++)
+         linear_center[i] = LLVMGetParam(func, base + 8 + i);
+      /* Overwrite LINEAR_SAMPLE. */
+      for (i = 0; i < 2; i++)
+         ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_center[i], base + 6 + i, "");
+      /* Overwrite LINEAR_CENTROID. */
+      for (i = 0; i < 2; i++)
+         ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_center[i], base + 10 + i, "");
+   }
+
+   /* Interpolate colors. */
+   unsigned color_out_idx = 0;
+   for (i = 0; i < 2; i++) {
+      unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
+      unsigned face_vgpr = key->ps_prolog.num_input_sgprs + key->ps_prolog.face_vgpr_index;
+      LLVMValueRef interp[2], color[4];
+      LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
+
+      if (!writemask)
+         continue;
+
+      /* If the interpolation qualifier is not CONSTANT (-1). */
+      if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
+         unsigned interp_vgpr =
+            key->ps_prolog.num_input_sgprs + key->ps_prolog.color_interp_vgpr_index[i];
+
+         /* Get the (i,j) updated by bc_optimize handling. */
+         interp[0] = LLVMBuildExtractValue(ctx->ac.builder, ret, interp_vgpr, "");
+         interp[1] = LLVMBuildExtractValue(ctx->ac.builder, ret, interp_vgpr + 1, "");
+         interp_ij = ac_build_gather_values(&ctx->ac, interp, 2);
+      }
+
+      /* Use the absolute location of the input. */
+      prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
+
+      if (key->ps_prolog.states.color_two_side) {
+         face = LLVMGetParam(func, face_vgpr);
+         face = ac_to_integer(&ctx->ac, face);
+      }
+
+      interp_fs_color(ctx, key->ps_prolog.color_attr_index[i], i, key->ps_prolog.num_interp_inputs,
+                      key->ps_prolog.colors_read, interp_ij, prim_mask, face, color);
+
+      while (writemask) {
+         unsigned chan = u_bit_scan(&writemask);
+         ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan],
+                                    ctx->args.arg_count + color_out_idx++, "");
+      }
+   }
+
+   /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
+    * says:
+    *
+    *    "When per-sample shading is active due to the use of a fragment
+    *     input qualified by sample or due to the use of the gl_SampleID
+    *     or gl_SamplePosition variables, only the bit for the current
+    *     sample is set in gl_SampleMaskIn. When state specifies multiple
+    *     fragment shader invocations for a given fragment, the sample
+    *     mask for any single fragment shader invocation may specify a
+    *     subset of the covered samples for the fragment. In this case,
+    *     the bit corresponding to each covered sample will be set in
+    *     exactly one fragment shader invocation."
+    *
+    * The samplemask loaded by hardware is always the coverage of the
+    * entire pixel/fragment, so mask bits out based on the sample ID.
+    */
+   if (key->ps_prolog.states.samplemask_log_ps_iter) {
+      /* The bit pattern matches that used by fixed function fragment
+       * processing. */
+      static const uint16_t ps_iter_masks[] = {
+         0xffff, /* not used */
+         0x5555, 0x1111, 0x0101, 0x0001,
+      };
+      assert(key->ps_prolog.states.samplemask_log_ps_iter < ARRAY_SIZE(ps_iter_masks));
+
+      uint32_t ps_iter_mask = ps_iter_masks[key->ps_prolog.states.samplemask_log_ps_iter];
+      LLVMValueRef sampleid = si_unpack_param(ctx, ancillary, 8, 4);
+      LLVMValueRef samplemask = ac_get_arg(&ctx->ac, param_sample_mask);
+
+      samplemask = ac_to_integer(&ctx->ac, samplemask);
+      samplemask =
+         LLVMBuildAnd(ctx->ac.builder, samplemask,
+                      LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, ps_iter_mask, false),
+                                   sampleid, ""),
+                      "");
+      samplemask = ac_to_float(&ctx->ac, samplemask);
+
+      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, samplemask, param_sample_mask.arg_index, "");
+   }
+
+   /* Tell LLVM to insert WQM instruction sequence when needed. */
+   if (key->ps_prolog.wqm) {
+      LLVMAddTargetDependentFunctionAttr(func, "amdgpu-ps-wqm-outputs", "");
+   }
+
+   si_llvm_build_ret(ctx, ret);
 }
 
 /**
  * Build the pixel shader epilog function. This handles everything that must be
  * emulated for pixel shader exports. (alpha-test, format conversions, etc)
  */
-void si_llvm_build_ps_epilog(struct si_shader_context *ctx,
-                            union si_shader_part_key *key)
+void si_llvm_build_ps_epilog(struct si_shader_context *ctx, union si_shader_part_key *key)
 {
-       LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
-       int i;
-       struct si_ps_exports exp = {};
-
-       memset(&ctx->args, 0, sizeof(ctx->args));
-
-       /* Declare input SGPRs. */
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->rw_buffers);
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                  &ctx->bindless_samplers_and_images);
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                  &ctx->const_and_shader_buffers);
-       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                  &ctx->samplers_and_images);
-       si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT,
-                          NULL, SI_PARAM_ALPHA_REF);
-
-       /* Declare input VGPRs. */
-       unsigned required_num_params =
-                    ctx->args.num_sgprs_used +
-                    util_bitcount(key->ps_epilog.colors_written) * 4 +
-                    key->ps_epilog.writes_z +
-                    key->ps_epilog.writes_stencil +
-                    key->ps_epilog.writes_samplemask;
-
-       required_num_params = MAX2(required_num_params,
-                                  ctx->args.num_sgprs_used + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
-
-       while (ctx->args.arg_count < required_num_params)
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
-
-       /* Create the function. */
-       si_llvm_create_func(ctx, "ps_epilog", NULL, 0, 0);
-       /* Disable elimination of unused inputs. */
-       ac_llvm_add_target_dep_function_attr(ctx->main_fn,
-                                            "InitialPSInputAddr", 0xffffff);
-
-       /* Process colors. */
-       unsigned vgpr = ctx->args.num_sgprs_used;
-       unsigned colors_written = key->ps_epilog.colors_written;
-       int last_color_export = -1;
-
-       /* Find the last color export. */
-       if (!key->ps_epilog.writes_z &&
-           !key->ps_epilog.writes_stencil &&
-           !key->ps_epilog.writes_samplemask) {
-               unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
-
-               /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
-               if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
-                       /* Just set this if any of the colorbuffers are enabled. */
-                       if (spi_format &
-                           ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
-                               last_color_export = 0;
-               } else {
-                       for (i = 0; i < 8; i++)
-                               if (colors_written & (1 << i) &&
-                                   (spi_format >> (i * 4)) & 0xf)
-                                       last_color_export = i;
-               }
-       }
-
-       while (colors_written) {
-               LLVMValueRef color[4];
-               int mrt = u_bit_scan(&colors_written);
-
-               for (i = 0; i < 4; i++)
-                       color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
-
-               si_export_mrt_color(ctx, color, mrt,
-                                   ctx->args.arg_count - 1,
-                                   mrt == last_color_export, &exp);
-       }
-
-       /* Process depth, stencil, samplemask. */
-       if (key->ps_epilog.writes_z)
-               depth = LLVMGetParam(ctx->main_fn, vgpr++);
-       if (key->ps_epilog.writes_stencil)
-               stencil = LLVMGetParam(ctx->main_fn, vgpr++);
-       if (key->ps_epilog.writes_samplemask)
-               samplemask = LLVMGetParam(ctx->main_fn, vgpr++);
-
-       if (depth || stencil || samplemask)
-               si_export_mrt_z(ctx, depth, stencil, samplemask, &exp);
-       else if (last_color_export == -1)
-               ac_build_export_null(&ctx->ac);
-
-       if (exp.num)
-               si_emit_ps_exports(ctx, &exp);
-
-       /* Compile. */
-       LLVMBuildRetVoid(ctx->ac.builder);
+   LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
+   int i;
+   struct si_ps_exports exp = {};
+
+   memset(&ctx->args, 0, sizeof(ctx->args));
+
+   /* Declare input SGPRs. */
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->rw_buffers);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->bindless_samplers_and_images);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->const_and_shader_buffers);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->samplers_and_images);
+   si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_ALPHA_REF);
+
+   /* Declare input VGPRs. */
+   unsigned required_num_params =
+      ctx->args.num_sgprs_used + util_bitcount(key->ps_epilog.colors_written) * 4 +
+      key->ps_epilog.writes_z + key->ps_epilog.writes_stencil + key->ps_epilog.writes_samplemask;
+
+   required_num_params =
+      MAX2(required_num_params, ctx->args.num_sgprs_used + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
+
+   while (ctx->args.arg_count < required_num_params)
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
+
+   /* Create the function. */
+   si_llvm_create_func(ctx, "ps_epilog", NULL, 0, 0);
+   /* Disable elimination of unused inputs. */
+   ac_llvm_add_target_dep_function_attr(ctx->main_fn, "InitialPSInputAddr", 0xffffff);
+
+   /* Process colors. */
+   unsigned vgpr = ctx->args.num_sgprs_used;
+   unsigned colors_written = key->ps_epilog.colors_written;
+   int last_color_export = -1;
+
+   /* Find the last color export. */
+   if (!key->ps_epilog.writes_z && !key->ps_epilog.writes_stencil &&
+       !key->ps_epilog.writes_samplemask) {
+      unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
+
+      /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
+      if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
+         /* Just set this if any of the colorbuffers are enabled. */
+         if (spi_format & ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
+            last_color_export = 0;
+      } else {
+         for (i = 0; i < 8; i++)
+            if (colors_written & (1 << i) && (spi_format >> (i * 4)) & 0xf)
+               last_color_export = i;
+      }
+   }
+
+   while (colors_written) {
+      LLVMValueRef color[4];
+      int mrt = u_bit_scan(&colors_written);
+
+      for (i = 0; i < 4; i++)
+         color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
+
+      si_export_mrt_color(ctx, color, mrt, ctx->args.arg_count - 1, mrt == last_color_export, &exp);
+   }
+
+   /* Process depth, stencil, samplemask. */
+   if (key->ps_epilog.writes_z)
+      depth = LLVMGetParam(ctx->main_fn, vgpr++);
+   if (key->ps_epilog.writes_stencil)
+      stencil = LLVMGetParam(ctx->main_fn, vgpr++);
+   if (key->ps_epilog.writes_samplemask)
+      samplemask = LLVMGetParam(ctx->main_fn, vgpr++);
+
+   if (depth || stencil || samplemask)
+      si_export_mrt_z(ctx, depth, stencil, samplemask, &exp);
+   else if (last_color_export == -1)
+      ac_build_export_null(&ctx->ac);
+
+   if (exp.num)
+      si_emit_ps_exports(ctx, &exp);
+
+   /* Compile. */
+   LLVMBuildRetVoid(ctx->ac.builder);
 }
 
-void si_llvm_build_monolithic_ps(struct si_shader_context *ctx,
-                                struct si_shader *shader)
+void si_llvm_build_monolithic_ps(struct si_shader_context *ctx, struct si_shader *shader)
 {
-       LLVMValueRef parts[3];
-       unsigned num_parts = 0, main_index;
+   LLVMValueRef parts[3];
+   unsigned num_parts = 0, main_index;
 
-       union si_shader_part_key prolog_key;
-       si_get_ps_prolog_key(shader, &prolog_key, false);
+   union si_shader_part_key prolog_key;
+   si_get_ps_prolog_key(shader, &prolog_key, false);
 
-       if (si_need_ps_prolog(&prolog_key)) {
-               si_llvm_build_ps_prolog(ctx, &prolog_key);
-               parts[num_parts++] = ctx->main_fn;
-       }
+   if (si_need_ps_prolog(&prolog_key)) {
+      si_llvm_build_ps_prolog(ctx, &prolog_key);
+      parts[num_parts++] = ctx->main_fn;
+   }
 
-       main_index = num_parts;
-       parts[num_parts++] = ctx->main_fn;
+   main_index = num_parts;
+   parts[num_parts++] = ctx->main_fn;
 
-       union si_shader_part_key epilog_key;
-       si_get_ps_epilog_key(shader, &epilog_key);
-       si_llvm_build_ps_epilog(ctx, &epilog_key);
-       parts[num_parts++] = ctx->main_fn;
+   union si_shader_part_key epilog_key;
+   si_get_ps_epilog_key(shader, &epilog_key);
+   si_llvm_build_ps_epilog(ctx, &epilog_key);
+   parts[num_parts++] = ctx->main_fn;
 
-       si_build_wrapper_function(ctx, parts, num_parts, main_index, 0);
+   si_build_wrapper_function(ctx, parts, num_parts, main_index, 0);
 }
 
 void si_llvm_init_ps_callbacks(struct si_shader_context *ctx)
 {
-       ctx->abi.emit_outputs = si_llvm_return_fs_outputs;
-       ctx->abi.load_sample_position = load_sample_position;
-       ctx->abi.load_sample_mask_in = load_sample_mask_in;
-       ctx->abi.emit_fbfetch = si_nir_emit_fbfetch;
+   ctx->abi.emit_outputs = si_llvm_return_fs_outputs;
+   ctx->abi.load_sample_position = load_sample_position;
+   ctx->abi.load_sample_mask_in = load_sample_mask_in;
+   ctx->abi.emit_fbfetch = si_nir_emit_fbfetch;
 }
index cb06aa99ca705b93398905ceb2659f63efbe28c0..122e69762615bdc02796e90de553fc942098664a 100644 (file)
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include "si_shader_internal.h"
 #include "si_pipe.h"
+#include "si_shader_internal.h"
 #include "sid.h"
 
 /**
  * Return a value that is equal to the given i32 \p index if it lies in [0,num)
  * or an undefined value in the same interval otherwise.
  */
-static LLVMValueRef si_llvm_bound_index(struct si_shader_context *ctx,
-                                LLVMValueRef index,
-                                unsigned num)
+static LLVMValueRef si_llvm_bound_index(struct si_shader_context *ctx, LLVMValueRef index,
+                                        unsigned num)
 {
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef c_max = LLVMConstInt(ctx->ac.i32, num - 1, 0);
-       LLVMValueRef cc;
-
-       if (util_is_power_of_two_or_zero(num)) {
-               index = LLVMBuildAnd(builder, index, c_max, "");
-       } else {
-               /* In theory, this MAX pattern should result in code that is
-                * as good as the bit-wise AND above.
-                *
-                * In practice, LLVM generates worse code (at the time of
-                * writing), because its value tracking is not strong enough.
-                */
-               cc = LLVMBuildICmp(builder, LLVMIntULE, index, c_max, "");
-               index = LLVMBuildSelect(builder, cc, index, c_max, "");
-       }
-
-       return index;
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMValueRef c_max = LLVMConstInt(ctx->ac.i32, num - 1, 0);
+   LLVMValueRef cc;
+
+   if (util_is_power_of_two_or_zero(num)) {
+      index = LLVMBuildAnd(builder, index, c_max, "");
+   } else {
+      /* In theory, this MAX pattern should result in code that is
+       * as good as the bit-wise AND above.
+       *
+       * In practice, LLVM generates worse code (at the time of
+       * writing), because its value tracking is not strong enough.
+       */
+      cc = LLVMBuildICmp(builder, LLVMIntULE, index, c_max, "");
+      index = LLVMBuildSelect(builder, cc, index, c_max, "");
+   }
+
+   return index;
 }
 
 static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *ctx)
 {
-       LLVMValueRef ptr =
-               ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
-       struct si_shader_selector *sel = ctx->shader->selector;
-
-       /* Do the bounds checking with a descriptor, because
-        * doing computation and manual bounds checking of 64-bit
-        * addresses generates horrible VALU code with very high
-        * VGPR usage and very low SIMD occupancy.
-        */
-       ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, "");
-
-       LLVMValueRef desc0, desc1;
-       desc0 = ptr;
-       desc1 = LLVMConstInt(ctx->ac.i32,
-                            S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
-
-       uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
-                        S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-                        S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
-                        S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
-
-       if (ctx->screen->info.chip_class >= GFX10)
-               rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
-                        S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
-                        S_008F0C_RESOURCE_LEVEL(1);
-       else
-               rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
-                        S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
-
-       LLVMValueRef desc_elems[] = {
-               desc0,
-               desc1,
-               LLVMConstInt(ctx->ac.i32, sel->info.constbuf0_num_slots * 16, 0),
-               LLVMConstInt(ctx->ac.i32, rsrc3, false)
-       };
-
-       return ac_build_gather_values(&ctx->ac, desc_elems, 4);
+   LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
+   struct si_shader_selector *sel = ctx->shader->selector;
+
+   /* Do the bounds checking with a descriptor, because
+    * doing computation and manual bounds checking of 64-bit
+    * addresses generates horrible VALU code with very high
+    * VGPR usage and very low SIMD occupancy.
+    */
+   ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, "");
+
+   LLVMValueRef desc0, desc1;
+   desc0 = ptr;
+   desc1 = LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
+
+   uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+                    S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
+
+   if (ctx->screen->info.chip_class >= GFX10)
+      rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
+               S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
+   else
+      rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+
+   LLVMValueRef desc_elems[] = {desc0, desc1,
+                                LLVMConstInt(ctx->ac.i32, sel->info.constbuf0_num_slots * 16, 0),
+                                LLVMConstInt(ctx->ac.i32, rsrc3, false)};
+
+   return ac_build_gather_values(&ctx->ac, desc_elems, 4);
 }
 
 static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct si_shader_selector *sel = ctx->shader->selector;
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader_selector *sel = ctx->shader->selector;
 
-       LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
+   LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
 
-       if (sel->info.const_buffers_declared == 1 &&
-           sel->info.shader_buffers_declared == 0) {
-               return load_const_buffer_desc_fast_path(ctx);
-       }
+   if (sel->info.const_buffers_declared == 1 && sel->info.shader_buffers_declared == 0) {
+      return load_const_buffer_desc_fast_path(ctx);
+   }
 
-       index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers);
-       index = LLVMBuildAdd(ctx->ac.builder, index,
-                            LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS, 0), "");
+   index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers);
+   index =
+      LLVMBuildAdd(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS, 0), "");
 
-       return ac_build_load_to_sgpr(&ctx->ac, ptr, index);
+   return ac_build_load_to_sgpr(&ctx->ac, ptr, index);
 }
 
-static LLVMValueRef
-load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
+static LLVMValueRef load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       LLVMValueRef rsrc_ptr = ac_get_arg(&ctx->ac,
-                                          ctx->const_and_shader_buffers);
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   LLVMValueRef rsrc_ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
 
-       index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);
-       index = LLVMBuildSub(ctx->ac.builder,
-                            LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS - 1, 0),
-                            index, "");
+   index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);
+   index = LLVMBuildSub(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS - 1, 0),
+                        index, "");
 
-       return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index);
+   return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index);
 }
 
 /**
@@ -140,181 +127,167 @@ load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
  * nicer: disabling DCC in the shader still leads to undefined results but
  * avoids the lockup.
  */
-static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
-                                 LLVMValueRef rsrc)
+static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, LLVMValueRef rsrc)
 {
-       if (ctx->screen->info.chip_class <= GFX7) {
-               return rsrc;
-       } else {
-               LLVMValueRef i32_6 = LLVMConstInt(ctx->ac.i32, 6, 0);
-               LLVMValueRef i32_C = LLVMConstInt(ctx->ac.i32, C_008F28_COMPRESSION_EN, 0);
-               LLVMValueRef tmp;
-
-               tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, "");
-               tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, "");
-               return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, "");
-       }
+   if (ctx->screen->info.chip_class <= GFX7) {
+      return rsrc;
+   } else {
+      LLVMValueRef i32_6 = LLVMConstInt(ctx->ac.i32, 6, 0);
+      LLVMValueRef i32_C = LLVMConstInt(ctx->ac.i32, C_008F28_COMPRESSION_EN, 0);
+      LLVMValueRef tmp;
+
+      tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, "");
+      tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, "");
+      return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, "");
+   }
 }
 
 /* AC_DESC_FMASK is handled exactly like AC_DESC_IMAGE. The caller should
  * adjust "index" to point to FMASK. */
-static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx,
-                                      LLVMValueRef list, LLVMValueRef index,
-                                      enum ac_descriptor_type desc_type,
-                                      bool uses_store, bool bindless)
+static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, LLVMValueRef list,
+                                       LLVMValueRef index, enum ac_descriptor_type desc_type,
+                                       bool uses_store, bool bindless)
 {
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef rsrc;
-
-       if (desc_type == AC_DESC_BUFFER) {
-               index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0),
-                                     ctx->ac.i32_1);
-               list = LLVMBuildPointerCast(builder, list,
-                                           ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
-       } else {
-               assert(desc_type == AC_DESC_IMAGE ||
-                      desc_type == AC_DESC_FMASK);
-       }
-
-       if (bindless)
-               rsrc = ac_build_load_to_sgpr_uint_wraparound(&ctx->ac, list, index);
-       else
-               rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index);
-
-       if (desc_type == AC_DESC_IMAGE && uses_store)
-               rsrc = force_dcc_off(ctx, rsrc);
-       return rsrc;
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMValueRef rsrc;
+
+   if (desc_type == AC_DESC_BUFFER) {
+      index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1);
+      list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
+   } else {
+      assert(desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_FMASK);
+   }
+
+   if (bindless)
+      rsrc = ac_build_load_to_sgpr_uint_wraparound(&ctx->ac, list, index);
+   else
+      rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index);
+
+   if (desc_type == AC_DESC_IMAGE && uses_store)
+      rsrc = force_dcc_off(ctx, rsrc);
+   return rsrc;
 }
 
 /**
  * Load an image view, fmask view, or sampler state descriptor.
  */
-static LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx,
-                                        LLVMValueRef list, LLVMValueRef index,
-                                        enum ac_descriptor_type type)
+static LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx, LLVMValueRef list,
+                                         LLVMValueRef index, enum ac_descriptor_type type)
 {
-       LLVMBuilderRef builder = ctx->ac.builder;
-
-       switch (type) {
-       case AC_DESC_IMAGE:
-               /* The image is at [0:7]. */
-               index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, 2, 0), "");
-               break;
-       case AC_DESC_BUFFER:
-               /* The buffer is in [4:7]. */
-               index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0),
-                                     ctx->ac.i32_1);
-               list = LLVMBuildPointerCast(builder, list,
-                                           ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
-               break;
-       case AC_DESC_FMASK:
-               /* The FMASK is at [8:15]. */
-               index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0),
-                                     ctx->ac.i32_1);
-               break;
-       case AC_DESC_SAMPLER:
-               /* The sampler state is at [12:15]. */
-               index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0),
-                                     LLVMConstInt(ctx->ac.i32, 3, 0));
-               list = LLVMBuildPointerCast(builder, list,
-                                           ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
-               break;
-       case AC_DESC_PLANE_0:
-       case AC_DESC_PLANE_1:
-       case AC_DESC_PLANE_2:
-               /* Only used for the multiplane image support for Vulkan. Should
-                * never be reached in radeonsi.
-                */
-               unreachable("Plane descriptor requested in radeonsi.");
-       }
-
-       return ac_build_load_to_sgpr(&ctx->ac, list, index);
+   LLVMBuilderRef builder = ctx->ac.builder;
+
+   switch (type) {
+   case AC_DESC_IMAGE:
+      /* The image is at [0:7]. */
+      index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, 2, 0), "");
+      break;
+   case AC_DESC_BUFFER:
+      /* The buffer is in [4:7]. */
+      index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), ctx->ac.i32_1);
+      list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
+      break;
+   case AC_DESC_FMASK:
+      /* The FMASK is at [8:15]. */
+      index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1);
+      break;
+   case AC_DESC_SAMPLER:
+      /* The sampler state is at [12:15]. */
+      index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0),
+                            LLVMConstInt(ctx->ac.i32, 3, 0));
+      list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
+      break;
+   case AC_DESC_PLANE_0:
+   case AC_DESC_PLANE_1:
+   case AC_DESC_PLANE_2:
+      /* Only used for the multiplane image support for Vulkan. Should
+       * never be reached in radeonsi.
+       */
+      unreachable("Plane descriptor requested in radeonsi.");
+   }
+
+   return ac_build_load_to_sgpr(&ctx->ac, list, index);
 }
 
-static LLVMValueRef
-si_nir_load_sampler_desc(struct ac_shader_abi *abi,
-                        unsigned descriptor_set, unsigned base_index,
-                        unsigned constant_index, LLVMValueRef dynamic_index,
-                        enum ac_descriptor_type desc_type, bool image,
-                        bool write, bool bindless)
+static LLVMValueRef si_nir_load_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set,
+                                             unsigned base_index, unsigned constant_index,
+                                             LLVMValueRef dynamic_index,
+                                             enum ac_descriptor_type desc_type, bool image,
+                                             bool write, bool bindless)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       LLVMBuilderRef builder = ctx->ac.builder;
-       unsigned const_index = base_index + constant_index;
-
-       assert(!descriptor_set);
-       assert(desc_type <= AC_DESC_BUFFER);
-
-       if (bindless) {
-               LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->bindless_samplers_and_images);
-
-               /* dynamic_index is the bindless handle */
-               if (image) {
-                       /* Bindless image descriptors use 16-dword slots. */
-                       dynamic_index = LLVMBuildMul(ctx->ac.builder, dynamic_index,
-                                            LLVMConstInt(ctx->ac.i64, 2, 0), "");
-                       /* FMASK is right after the image. */
-                       if (desc_type == AC_DESC_FMASK) {
-                               dynamic_index = LLVMBuildAdd(ctx->ac.builder, dynamic_index,
-                                                            ctx->ac.i32_1, "");
-                       }
-
-                       return si_load_image_desc(ctx, list, dynamic_index, desc_type,
-                                                 write, true);
-               }
-
-               /* Since bindless handle arithmetic can contain an unsigned integer
-                * wraparound and si_load_sampler_desc assumes there isn't any,
-                * use GEP without "inbounds" (inside ac_build_pointer_add)
-                * to prevent incorrect code generation and hangs.
-                */
-               dynamic_index = LLVMBuildMul(ctx->ac.builder, dynamic_index,
-                                            LLVMConstInt(ctx->ac.i64, 2, 0), "");
-               list = ac_build_pointer_add(&ctx->ac, list, dynamic_index);
-               return si_load_sampler_desc(ctx, list, ctx->ac.i32_0, desc_type);
-       }
-
-       unsigned num_slots = image ? ctx->num_images : ctx->num_samplers;
-       assert(const_index < num_slots || dynamic_index);
-
-       LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->samplers_and_images);
-       LLVMValueRef index = LLVMConstInt(ctx->ac.i32, const_index, false);
-
-       if (dynamic_index) {
-               index = LLVMBuildAdd(builder, index, dynamic_index, "");
-
-               /* From the GL_ARB_shader_image_load_store extension spec:
-                *
-                *    If a shader performs an image load, store, or atomic
-                *    operation using an image variable declared as an array,
-                *    and if the index used to select an individual element is
-                *    negative or greater than or equal to the size of the
-                *    array, the results of the operation are undefined but may
-                *    not lead to termination.
-                */
-               index = si_llvm_bound_index(ctx, index, num_slots);
-       }
-
-       if (image) {
-               /* FMASKs are separate from images. */
-               if (desc_type == AC_DESC_FMASK) {
-                       index = LLVMBuildAdd(ctx->ac.builder, index,
-                                            LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGES, 0), "");
-               }
-               index = LLVMBuildSub(ctx->ac.builder,
-                                    LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS - 1, 0),
-                                    index, "");
-               return si_load_image_desc(ctx, list, index, desc_type, write, false);
-       }
-
-       index = LLVMBuildAdd(ctx->ac.builder, index,
-                            LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS / 2, 0), "");
-       return si_load_sampler_desc(ctx, list, index, desc_type);
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   LLVMBuilderRef builder = ctx->ac.builder;
+   unsigned const_index = base_index + constant_index;
+
+   assert(!descriptor_set);
+   assert(desc_type <= AC_DESC_BUFFER);
+
+   if (bindless) {
+      LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->bindless_samplers_and_images);
+
+      /* dynamic_index is the bindless handle */
+      if (image) {
+         /* Bindless image descriptors use 16-dword slots. */
+         dynamic_index =
+            LLVMBuildMul(ctx->ac.builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), "");
+         /* FMASK is right after the image. */
+         if (desc_type == AC_DESC_FMASK) {
+            dynamic_index = LLVMBuildAdd(ctx->ac.builder, dynamic_index, ctx->ac.i32_1, "");
+         }
+
+         return si_load_image_desc(ctx, list, dynamic_index, desc_type, write, true);
+      }
+
+      /* Since bindless handle arithmetic can contain an unsigned integer
+       * wraparound and si_load_sampler_desc assumes there isn't any,
+       * use GEP without "inbounds" (inside ac_build_pointer_add)
+       * to prevent incorrect code generation and hangs.
+       */
+      dynamic_index =
+         LLVMBuildMul(ctx->ac.builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), "");
+      list = ac_build_pointer_add(&ctx->ac, list, dynamic_index);
+      return si_load_sampler_desc(ctx, list, ctx->ac.i32_0, desc_type);
+   }
+
+   unsigned num_slots = image ? ctx->num_images : ctx->num_samplers;
+   assert(const_index < num_slots || dynamic_index);
+
+   LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->samplers_and_images);
+   LLVMValueRef index = LLVMConstInt(ctx->ac.i32, const_index, false);
+
+   if (dynamic_index) {
+      index = LLVMBuildAdd(builder, index, dynamic_index, "");
+
+      /* From the GL_ARB_shader_image_load_store extension spec:
+       *
+       *    If a shader performs an image load, store, or atomic
+       *    operation using an image variable declared as an array,
+       *    and if the index used to select an individual element is
+       *    negative or greater than or equal to the size of the
+       *    array, the results of the operation are undefined but may
+       *    not lead to termination.
+       */
+      index = si_llvm_bound_index(ctx, index, num_slots);
+   }
+
+   if (image) {
+      /* FMASKs are separate from images. */
+      if (desc_type == AC_DESC_FMASK) {
+         index =
+            LLVMBuildAdd(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGES, 0), "");
+      }
+      index = LLVMBuildSub(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS - 1, 0),
+                           index, "");
+      return si_load_image_desc(ctx, list, index, desc_type, write, false);
+   }
+
+   index = LLVMBuildAdd(ctx->ac.builder, index,
+                        LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS / 2, 0), "");
+   return si_load_sampler_desc(ctx, list, index, desc_type);
 }
 
 void si_llvm_init_resource_callbacks(struct si_shader_context *ctx)
 {
-       ctx->abi.load_ubo = load_ubo;
-       ctx->abi.load_ssbo = load_ssbo;
-       ctx->abi.load_sampler_desc = si_nir_load_sampler_desc;
+   ctx->abi.load_ubo = load_ubo;
+   ctx->abi.load_ssbo = load_ssbo;
+   ctx->abi.load_sampler_desc = si_nir_load_sampler_desc;
 }
index 116e06e5af1e4334147694a7f13811970f48319f..5dba9859988b25912ee2688ec3ea089a2d89672c 100644 (file)
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include "si_shader_internal.h"
 #include "si_pipe.h"
+#include "si_shader_internal.h"
 #include "sid.h"
 
 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
 {
-       switch (ctx->type) {
-       case PIPE_SHADER_TESS_CTRL:
-               return si_unpack_param(ctx, ctx->args.tcs_rel_ids, 0, 8);
+   switch (ctx->type) {
+   case PIPE_SHADER_TESS_CTRL:
+      return si_unpack_param(ctx, ctx->args.tcs_rel_ids, 0, 8);
 
-       case PIPE_SHADER_TESS_EVAL:
-               return ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id);
+   case PIPE_SHADER_TESS_EVAL:
+      return ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id);
 
-       default:
-               assert(0);
-               return NULL;
-       }
+   default:
+      assert(0);
+      return NULL;
+   }
 }
 
 /* Tessellation shaders pass outputs to the next shader using LDS.
@@ -62,151 +62,134 @@ static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
  * All three shaders VS(LS), TCS, TES share the same LDS space.
  */
 
-static LLVMValueRef
-get_tcs_in_patch_stride(struct si_shader_context *ctx)
+static LLVMValueRef get_tcs_in_patch_stride(struct si_shader_context *ctx)
 {
-       return si_unpack_param(ctx, ctx->vs_state_bits, 11, 13);
+   return si_unpack_param(ctx, ctx->vs_state_bits, 11, 13);
 }
 
 static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx)
 {
-       assert(ctx->type == PIPE_SHADER_TESS_CTRL);
+   assert(ctx->type == PIPE_SHADER_TESS_CTRL);
 
-       if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
-               return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4;
+   if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
+      return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4;
 
-       return util_last_bit64(ctx->shader->selector->outputs_written) * 4;
+   return util_last_bit64(ctx->shader->selector->outputs_written) * 4;
 }
 
 static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx)
 {
-       unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx);
+   unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx);
 
-       return LLVMConstInt(ctx->ac.i32, stride, 0);
+   return LLVMConstInt(ctx->ac.i32, stride, 0);
 }
 
 static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx)
 {
-       if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
-               return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 0, 13);
-
-       const struct si_shader_info *info = &ctx->shader->selector->info;
-       unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
-       unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx);
-       unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written);
-       unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride +
-                                  num_patch_outputs * 4;
-       return LLVMConstInt(ctx->ac.i32, patch_dw_stride, 0);
+   if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
+      return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 0, 13);
+
+   const struct si_shader_info *info = &ctx->shader->selector->info;
+   unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
+   unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx);
+   unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written);
+   unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride + num_patch_outputs * 4;
+   return LLVMConstInt(ctx->ac.i32, patch_dw_stride, 0);
 }
 
-static LLVMValueRef
-get_tcs_out_patch0_offset(struct si_shader_context *ctx)
+static LLVMValueRef get_tcs_out_patch0_offset(struct si_shader_context *ctx)
 {
-       return LLVMBuildMul(ctx->ac.builder,
-                           si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 0, 16),
-                           LLVMConstInt(ctx->ac.i32, 4, 0), "");
+   return LLVMBuildMul(ctx->ac.builder, si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 0, 16),
+                       LLVMConstInt(ctx->ac.i32, 4, 0), "");
 }
 
-static LLVMValueRef
-get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
+static LLVMValueRef get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
 {
-       return LLVMBuildMul(ctx->ac.builder,
-                           si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 16, 16),
-                           LLVMConstInt(ctx->ac.i32, 4, 0), "");
+   return LLVMBuildMul(ctx->ac.builder, si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 16, 16),
+                       LLVMConstInt(ctx->ac.i32, 4, 0), "");
 }
 
-static LLVMValueRef
-get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
+static LLVMValueRef get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
 {
-       LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
-       LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
+   LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
+   LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 
-       return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, "");
+   return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, "");
 }
 
-static LLVMValueRef
-get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
+static LLVMValueRef get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
 {
-       LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
-       LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
-       LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
+   LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
+   LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
+   LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 
-       return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_offset);
+   return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_offset);
 }
 
-static LLVMValueRef
-get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
+static LLVMValueRef get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
 {
-       LLVMValueRef patch0_patch_data_offset =
-               get_tcs_out_patch0_patch_data_offset(ctx);
-       LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
-       LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
+   LLVMValueRef patch0_patch_data_offset = get_tcs_out_patch0_patch_data_offset(ctx);
+   LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
+   LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 
-       return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_patch_data_offset);
+   return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_patch_data_offset);
 }
 
 static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx)
 {
-       unsigned tcs_out_vertices =
-               ctx->shader->selector ?
-               ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : 0;
+   unsigned tcs_out_vertices =
+      ctx->shader->selector ? ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT]
+                            : 0;
 
-       /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */
-       if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices)
-               return LLVMConstInt(ctx->ac.i32, tcs_out_vertices, 0);
+   /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */
+   if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices)
+      return LLVMConstInt(ctx->ac.i32, tcs_out_vertices, 0);
 
-       return si_unpack_param(ctx, ctx->tcs_offchip_layout, 6, 6);
+   return si_unpack_param(ctx, ctx->tcs_offchip_layout, 6, 6);
 }
 
 static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
 {
-       unsigned stride;
-
-       switch (ctx->type) {
-       case PIPE_SHADER_VERTEX:
-               stride = ctx->shader->selector->lshs_vertex_stride / 4;
-               return LLVMConstInt(ctx->ac.i32, stride, 0);
-
-       case PIPE_SHADER_TESS_CTRL:
-               if (ctx->screen->info.chip_class >= GFX9 &&
-                   ctx->shader->is_monolithic) {
-                       stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4;
-                       return LLVMConstInt(ctx->ac.i32, stride, 0);
-               }
-               return si_unpack_param(ctx, ctx->vs_state_bits, 24, 8);
-
-       default:
-               assert(0);
-               return NULL;
-       }
+   unsigned stride;
+
+   switch (ctx->type) {
+   case PIPE_SHADER_VERTEX:
+      stride = ctx->shader->selector->lshs_vertex_stride / 4;
+      return LLVMConstInt(ctx->ac.i32, stride, 0);
+
+   case PIPE_SHADER_TESS_CTRL:
+      if (ctx->screen->info.chip_class >= GFX9 && ctx->shader->is_monolithic) {
+         stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4;
+         return LLVMConstInt(ctx->ac.i32, stride, 0);
+      }
+      return si_unpack_param(ctx, ctx->vs_state_bits, 24, 8);
+
+   default:
+      assert(0);
+      return NULL;
+   }
 }
 
-static LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context *ctx,
-                                                       LLVMValueRef vertex_dw_stride,
-                                                       LLVMValueRef base_addr,
-                                                       LLVMValueRef vertex_index,
-                                                       LLVMValueRef param_index,
-                                                       ubyte name, ubyte index)
+static LLVMValueRef
+get_dw_address_from_generic_indices(struct si_shader_context *ctx, LLVMValueRef vertex_dw_stride,
+                                    LLVMValueRef base_addr, LLVMValueRef vertex_index,
+                                    LLVMValueRef param_index, ubyte name, ubyte index)
 {
-       if (vertex_dw_stride) {
-               base_addr = ac_build_imad(&ctx->ac, vertex_index,
-                                         vertex_dw_stride, base_addr);
-       }
-
-       if (param_index) {
-               base_addr = ac_build_imad(&ctx->ac, param_index,
-                                         LLVMConstInt(ctx->ac.i32, 4, 0), base_addr);
-       }
-
-       int param = name == TGSI_SEMANTIC_PATCH ||
-                   name == TGSI_SEMANTIC_TESSINNER ||
-                   name == TGSI_SEMANTIC_TESSOUTER ?
-               si_shader_io_get_unique_index_patch(name, index) :
-               si_shader_io_get_unique_index(name, index, false);
-
-       /* Add the base address of the element. */
-       return LLVMBuildAdd(ctx->ac.builder, base_addr,
-                           LLVMConstInt(ctx->ac.i32, param * 4, 0), "");
+   if (vertex_dw_stride) {
+      base_addr = ac_build_imad(&ctx->ac, vertex_index, vertex_dw_stride, base_addr);
+   }
+
+   if (param_index) {
+      base_addr = ac_build_imad(&ctx->ac, param_index, LLVMConstInt(ctx->ac.i32, 4, 0), base_addr);
+   }
+
+   int param = name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER ||
+                     name == TGSI_SEMANTIC_TESSOUTER
+                  ? si_shader_io_get_unique_index_patch(name, index)
+                  : si_shader_io_get_unique_index(name, index, false);
+
+   /* Add the base address of the element. */
+   return LLVMBuildAdd(ctx->ac.builder, base_addr, LLVMConstInt(ctx->ac.i32, param * 4, 0), "");
 }
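The LDS address produced above is a simple linear combination of the indices; a hedged scalar sketch of the same arithmetic (names are illustrative, and the vertex term drops out when no per-vertex stride is passed):

/* Illustrative dword-address model of get_dw_address_from_generic_indices():
 * a per-vertex block, then 4 dwords per generic parameter, then the 4-dword
 * slot assigned to this semantic by si_shader_io_get_unique_index*(). */
static unsigned lds_dw_address(unsigned base_addr, unsigned vertex_index,
                               unsigned vertex_dw_stride, unsigned param_index,
                               unsigned unique_slot)
{
   return base_addr + vertex_index * vertex_dw_stride + (param_index + unique_slot) * 4;
}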
 
 /* The offchip buffer layout for TCS->TES is
@@ -228,98 +211,88 @@ static LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context
  * Note that every attribute has 4 components.
  */
 static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
-                                              LLVMValueRef rel_patch_id,
-                                               LLVMValueRef vertex_index,
+                                               LLVMValueRef rel_patch_id, LLVMValueRef vertex_index,
                                                LLVMValueRef param_index)
 {
-       LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
-       LLVMValueRef param_stride, constant16;
-
-       vertices_per_patch = get_num_tcs_out_vertices(ctx);
-       num_patches = si_unpack_param(ctx, ctx->tcs_offchip_layout, 0, 6);
-       total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch,
-                                     num_patches, "");
-
-       constant16 = LLVMConstInt(ctx->ac.i32, 16, 0);
-       if (vertex_index) {
-               base_addr = ac_build_imad(&ctx->ac, rel_patch_id,
-                                         vertices_per_patch, vertex_index);
-               param_stride = total_vertices;
-       } else {
-               base_addr = rel_patch_id;
-               param_stride = num_patches;
-       }
-
-       base_addr = ac_build_imad(&ctx->ac, param_index, param_stride, base_addr);
-       base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, "");
-
-       if (!vertex_index) {
-               LLVMValueRef patch_data_offset =
-                          si_unpack_param(ctx, ctx->tcs_offchip_layout, 12, 20);
-
-               base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
-                                        patch_data_offset, "");
-       }
-       return base_addr;
+   LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
+   LLVMValueRef param_stride, constant16;
+
+   vertices_per_patch = get_num_tcs_out_vertices(ctx);
+   num_patches = si_unpack_param(ctx, ctx->tcs_offchip_layout, 0, 6);
+   total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch, num_patches, "");
+
+   constant16 = LLVMConstInt(ctx->ac.i32, 16, 0);
+   if (vertex_index) {
+      base_addr = ac_build_imad(&ctx->ac, rel_patch_id, vertices_per_patch, vertex_index);
+      param_stride = total_vertices;
+   } else {
+      base_addr = rel_patch_id;
+      param_stride = num_patches;
+   }
+
+   base_addr = ac_build_imad(&ctx->ac, param_index, param_stride, base_addr);
+   base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, "");
+
+   if (!vertex_index) {
+      LLVMValueRef patch_data_offset = si_unpack_param(ctx, ctx->tcs_offchip_layout, 12, 20);
+
+      base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, patch_data_offset, "");
+   }
+   return base_addr;
 }
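The address computation above can be read as the following scalar sketch (byte address; every attribute is 4 dwords = 16 bytes; parameter names are illustrative):

#include <stdbool.h>

/* Illustrative model of get_tcs_tes_buffer_address(): per-vertex attributes
 * are indexed as (patch, vertex) pairs with a stride of total_vertices per
 * parameter; per-patch attributes live after patch_data_offset with a stride
 * of num_patches per parameter. */
static unsigned tcs_tes_buffer_address(unsigned rel_patch_id, unsigned vertex_index,
                                       unsigned param_index, unsigned verts_per_patch,
                                       unsigned num_patches, unsigned patch_data_offset,
                                       bool per_vertex)
{
   unsigned base, param_stride;

   if (per_vertex) {
      base = rel_patch_id * verts_per_patch + vertex_index;
      param_stride = verts_per_patch * num_patches; /* total vertices */
   } else {
      base = rel_patch_id;
      param_stride = num_patches;
   }

   base = (param_index * param_stride + base) * 16;

   if (!per_vertex)
      base += patch_data_offset; /* start of the per-patch region */

   return base;
}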
 
-static LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices(
-                                       struct si_shader_context *ctx,
-                                       LLVMValueRef vertex_index,
-                                       LLVMValueRef param_index,
-                                       ubyte name, ubyte index)
+static LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices(struct si_shader_context *ctx,
+                                                                    LLVMValueRef vertex_index,
+                                                                    LLVMValueRef param_index,
+                                                                    ubyte name, ubyte index)
 {
-       unsigned param_index_base;
-
-       param_index_base = name == TGSI_SEMANTIC_PATCH ||
-                          name == TGSI_SEMANTIC_TESSINNER ||
-                          name == TGSI_SEMANTIC_TESSOUTER ?
-               si_shader_io_get_unique_index_patch(name, index) :
-               si_shader_io_get_unique_index(name, index, false);
-
-       if (param_index) {
-               param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
-                                          LLVMConstInt(ctx->ac.i32, param_index_base, 0),
-                                          "");
-       } else {
-               param_index = LLVMConstInt(ctx->ac.i32, param_index_base, 0);
-       }
-
-       return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
-                                         vertex_index, param_index);
+   unsigned param_index_base;
+
+   param_index_base = name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER ||
+                            name == TGSI_SEMANTIC_TESSOUTER
+                         ? si_shader_io_get_unique_index_patch(name, index)
+                         : si_shader_io_get_unique_index(name, index, false);
+
+   if (param_index) {
+      param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
+                                 LLVMConstInt(ctx->ac.i32, param_index_base, 0), "");
+   } else {
+      param_index = LLVMConstInt(ctx->ac.i32, param_index_base, 0);
+   }
+
+   return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), vertex_index, param_index);
 }
 
-static LLVMValueRef buffer_load(struct si_shader_context *ctx,
-                                LLVMTypeRef type, unsigned swizzle,
-                                LLVMValueRef buffer, LLVMValueRef offset,
-                                LLVMValueRef base, bool can_speculate)
+static LLVMValueRef buffer_load(struct si_shader_context *ctx, LLVMTypeRef type, unsigned swizzle,
+                                LLVMValueRef buffer, LLVMValueRef offset, LLVMValueRef base,
+                                bool can_speculate)
 {
-       LLVMValueRef value, value2;
-       LLVMTypeRef vec_type = LLVMVectorType(type, 4);
+   LLVMValueRef value, value2;
+   LLVMTypeRef vec_type = LLVMVectorType(type, 4);
 
-       if (swizzle == ~0) {
-               value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
-                                            0, ac_glc, can_speculate, false);
+   if (swizzle == ~0) {
+      value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, 0, ac_glc,
+                                   can_speculate, false);
 
-               return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
-       }
+      return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
+   }
 
-       if (ac_get_type_size(type) != 8) {
-               value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
-                                            0, ac_glc, can_speculate, false);
+   if (ac_get_type_size(type) != 8) {
+      value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, 0, ac_glc,
+                                   can_speculate, false);
 
-               value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
-               return LLVMBuildExtractElement(ctx->ac.builder, value,
-                                   LLVMConstInt(ctx->ac.i32, swizzle, 0), "");
-       }
+      value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
+      return LLVMBuildExtractElement(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, swizzle, 0),
+                                     "");
+   }
 
-       value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
-                                 swizzle * 4, ac_glc, can_speculate, false);
+   value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, swizzle * 4, ac_glc,
+                                can_speculate, false);
 
-       value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
-                                  swizzle * 4 + 4, ac_glc, can_speculate, false);
+   value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, swizzle * 4 + 4, ac_glc,
+                                 can_speculate, false);
 
-       return si_build_gather_64bit(ctx, type, value, value2);
+   return si_build_gather_64bit(ctx, type, value, value2);
 }
 
 /**
@@ -329,36 +302,34 @@ static LLVMValueRef buffer_load(struct si_shader_context *ctx,
  * \param swizzle      offset (typically 0..3); it can be ~0, which loads a vec4
  * \param dw_addr      address in dwords
  */
-static LLVMValueRef lshs_lds_load(struct si_shader_context *ctx,
-                                 LLVMTypeRef type, unsigned swizzle,
-                                 LLVMValueRef dw_addr)
+static LLVMValueRef lshs_lds_load(struct si_shader_context *ctx, LLVMTypeRef type, unsigned swizzle,
+                                  LLVMValueRef dw_addr)
 {
-       LLVMValueRef value;
+   LLVMValueRef value;
 
-       if (swizzle == ~0) {
-               LLVMValueRef values[4];
+   if (swizzle == ~0) {
+      LLVMValueRef values[4];
 
-               for (unsigned chan = 0; chan < 4; chan++)
-                       values[chan] = lshs_lds_load(ctx, type, chan, dw_addr);
+      for (unsigned chan = 0; chan < 4; chan++)
+         values[chan] = lshs_lds_load(ctx, type, chan, dw_addr);
 
-               return ac_build_gather_values(&ctx->ac, values, 4);
-       }
+      return ac_build_gather_values(&ctx->ac, values, 4);
+   }
 
-       /* Split 64-bit loads. */
-       if (ac_get_type_size(type) == 8) {
-               LLVMValueRef lo, hi;
+   /* Split 64-bit loads. */
+   if (ac_get_type_size(type) == 8) {
+      LLVMValueRef lo, hi;
 
-               lo = lshs_lds_load(ctx, ctx->ac.i32, swizzle, dw_addr);
-               hi = lshs_lds_load(ctx, ctx->ac.i32, swizzle + 1, dw_addr);
-               return si_build_gather_64bit(ctx, type, lo, hi);
-       }
+      lo = lshs_lds_load(ctx, ctx->ac.i32, swizzle, dw_addr);
+      hi = lshs_lds_load(ctx, ctx->ac.i32, swizzle + 1, dw_addr);
+      return si_build_gather_64bit(ctx, type, lo, hi);
+   }
 
-       dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
-                              LLVMConstInt(ctx->ac.i32, swizzle, 0), "");
+   dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, LLVMConstInt(ctx->ac.i32, swizzle, 0), "");
 
-       value = ac_lds_load(&ctx->ac, dw_addr);
+   value = ac_lds_load(&ctx->ac, dw_addr);
 
-       return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
+   return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
 }
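For example, lshs_lds_load(ctx, ctx->ac.f32, 2, dw_addr) loads the single .z component stored at dw_addr + 2, a 64-bit type at the same swizzle is assembled from two consecutive dword loads, and swizzle == ~0 returns all four components as a vec4.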
 
 /**
@@ -368,423 +339,367 @@ static LLVMValueRef lshs_lds_load(struct si_shader_context *ctx,
  * \param dw_addr      address in dwords
  * \param value                value to store
  */
-static void lshs_lds_store(struct si_shader_context *ctx,
-                     unsigned dw_offset_imm, LLVMValueRef dw_addr,
-                     LLVMValueRef value)
+static void lshs_lds_store(struct si_shader_context *ctx, unsigned dw_offset_imm,
+                           LLVMValueRef dw_addr, LLVMValueRef value)
 {
-       dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
-                              LLVMConstInt(ctx->ac.i32, dw_offset_imm, 0), "");
+   dw_addr =
+      LLVMBuildAdd(ctx->ac.builder, dw_addr, LLVMConstInt(ctx->ac.i32, dw_offset_imm, 0), "");
 
-       ac_lds_store(&ctx->ac, dw_addr, value);
+   ac_lds_store(&ctx->ac, dw_addr, value);
 }
 
-enum si_tess_ring {
-       TCS_FACTOR_RING,
-       TESS_OFFCHIP_RING_TCS,
-       TESS_OFFCHIP_RING_TES,
+enum si_tess_ring
+{
+   TCS_FACTOR_RING,
+   TESS_OFFCHIP_RING_TCS,
+   TESS_OFFCHIP_RING_TES,
 };
 
-static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx,
-                                            enum si_tess_ring ring)
+static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx, enum si_tess_ring ring)
 {
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef addr = ac_get_arg(&ctx->ac,
-                                      ring == TESS_OFFCHIP_RING_TES ?
-                                      ctx->tes_offchip_addr :
-                                      ctx->tcs_out_lds_layout);
-
-       /* TCS only receives high 13 bits of the address. */
-       if (ring == TESS_OFFCHIP_RING_TCS || ring == TCS_FACTOR_RING) {
-               addr = LLVMBuildAnd(builder, addr,
-                                   LLVMConstInt(ctx->ac.i32, 0xfff80000, 0), "");
-       }
-
-       if (ring == TCS_FACTOR_RING) {
-               unsigned tf_offset = ctx->screen->tess_offchip_ring_size;
-               addr = LLVMBuildAdd(builder, addr,
-                                   LLVMConstInt(ctx->ac.i32, tf_offset, 0), "");
-       }
-
-       uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
-                        S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-                        S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
-                        S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
-
-       if (ctx->screen->info.chip_class >= GFX10)
-               rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
-                        S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
-                        S_008F0C_RESOURCE_LEVEL(1);
-       else
-               rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
-                        S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
-
-       LLVMValueRef desc[4];
-       desc[0] = addr;
-       desc[1] = LLVMConstInt(ctx->ac.i32,
-                              S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
-       desc[2] = LLVMConstInt(ctx->ac.i32, 0xffffffff, 0);
-       desc[3] = LLVMConstInt(ctx->ac.i32, rsrc3, false);
-
-       return ac_build_gather_values(&ctx->ac, desc, 4);
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMValueRef addr = ac_get_arg(
+      &ctx->ac, ring == TESS_OFFCHIP_RING_TES ? ctx->tes_offchip_addr : ctx->tcs_out_lds_layout);
+
+   /* TCS only receives high 13 bits of the address. */
+   if (ring == TESS_OFFCHIP_RING_TCS || ring == TCS_FACTOR_RING) {
+      addr = LLVMBuildAnd(builder, addr, LLVMConstInt(ctx->ac.i32, 0xfff80000, 0), "");
+   }
+
+   if (ring == TCS_FACTOR_RING) {
+      unsigned tf_offset = ctx->screen->tess_offchip_ring_size;
+      addr = LLVMBuildAdd(builder, addr, LLVMConstInt(ctx->ac.i32, tf_offset, 0), "");
+   }
+
+   uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+                    S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
+
+   if (ctx->screen->info.chip_class >= GFX10)
+      rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
+               S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
+   else
+      rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+
+   LLVMValueRef desc[4];
+   desc[0] = addr;
+   desc[1] = LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
+   desc[2] = LLVMConstInt(ctx->ac.i32, 0xffffffff, 0);
+   desc[3] = LLVMConstInt(ctx->ac.i32, rsrc3, false);
+
+   return ac_build_gather_values(&ctx->ac, desc, 4);
 }
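The four dwords gathered above form a buffer resource descriptor: desc[0] holds the ring's 32-bit base address (for the TCS rings only the high 13 bits of the incoming value, plus the tess-factor ring offset in the TCS_FACTOR_RING case), desc[1] the high address bits, desc[2] is presumably the size/num-records field (left at 0xffffffff here), and desc[3] carries the dst_sel and data-format bits, with GFX10 additionally selecting raw out-of-bounds checking and RESOURCE_LEVEL(1).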
 
 void si_llvm_preload_tes_rings(struct si_shader_context *ctx)
 {
-       ctx->tess_offchip_ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TES);
+   ctx->tess_offchip_ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TES);
 }
 
-static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi,
-                                            LLVMTypeRef type,
-                                            LLVMValueRef vertex_index,
-                                            LLVMValueRef param_index,
-                                            unsigned const_index,
-                                            unsigned location,
-                                            unsigned driver_location,
-                                            unsigned component,
-                                            unsigned num_components,
-                                            bool is_patch,
-                                            bool is_compact,
-                                            bool load_input)
+static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi, LLVMTypeRef type,
+                                             LLVMValueRef vertex_index, LLVMValueRef param_index,
+                                             unsigned const_index, unsigned location,
+                                             unsigned driver_location, unsigned component,
+                                             unsigned num_components, bool is_patch,
+                                             bool is_compact, bool load_input)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct si_shader_info *info = &ctx->shader->selector->info;
-       LLVMValueRef dw_addr, stride;
-       ubyte name, index;
-
-       driver_location = driver_location / 4;
-
-       if (load_input) {
-               name = info->input_semantic_name[driver_location];
-               index = info->input_semantic_index[driver_location];
-       } else {
-               name = info->output_semantic_name[driver_location];
-               index = info->output_semantic_index[driver_location];
-       }
-
-       assert((name == TGSI_SEMANTIC_PATCH ||
-               name == TGSI_SEMANTIC_TESSINNER ||
-               name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
-
-       if (load_input) {
-               stride = get_tcs_in_vertex_dw_stride(ctx);
-               dw_addr = get_tcs_in_current_patch_offset(ctx);
-       } else {
-               if (is_patch) {
-                       stride = NULL;
-                       dw_addr = get_tcs_out_current_patch_data_offset(ctx);
-               } else {
-                       stride = get_tcs_out_vertex_dw_stride(ctx);
-                       dw_addr = get_tcs_out_current_patch_offset(ctx);
-               }
-       }
-
-       if (!param_index) {
-               param_index = LLVMConstInt(ctx->ac.i32, const_index, 0);
-       }
-
-       dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr,
-                                                     vertex_index, param_index,
-                                                     name, index);
-
-       LLVMValueRef value[4];
-       for (unsigned i = 0; i < num_components; i++) {
-               unsigned offset = i;
-               if (ac_get_type_size(type) == 8)
-                       offset *= 2;
-
-               offset += component;
-               value[i + component] = lshs_lds_load(ctx, type, offset, dw_addr);
-       }
-
-       return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader_info *info = &ctx->shader->selector->info;
+   LLVMValueRef dw_addr, stride;
+   ubyte name, index;
+
+   driver_location = driver_location / 4;
+
+   if (load_input) {
+      name = info->input_semantic_name[driver_location];
+      index = info->input_semantic_index[driver_location];
+   } else {
+      name = info->output_semantic_name[driver_location];
+      index = info->output_semantic_index[driver_location];
+   }
+
+   assert((name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER ||
+           name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
+
+   if (load_input) {
+      stride = get_tcs_in_vertex_dw_stride(ctx);
+      dw_addr = get_tcs_in_current_patch_offset(ctx);
+   } else {
+      if (is_patch) {
+         stride = NULL;
+         dw_addr = get_tcs_out_current_patch_data_offset(ctx);
+      } else {
+         stride = get_tcs_out_vertex_dw_stride(ctx);
+         dw_addr = get_tcs_out_current_patch_offset(ctx);
+      }
+   }
+
+   if (!param_index) {
+      param_index = LLVMConstInt(ctx->ac.i32, const_index, 0);
+   }
+
+   dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, vertex_index, param_index,
+                                                 name, index);
+
+   LLVMValueRef value[4];
+   for (unsigned i = 0; i < num_components; i++) {
+      unsigned offset = i;
+      if (ac_get_type_size(type) == 8)
+         offset *= 2;
+
+      offset += component;
+      value[i + component] = lshs_lds_load(ctx, type, offset, dw_addr);
+   }
+
+   return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
 }
 
-static LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi,
-                                         LLVMTypeRef type,
-                                         LLVMValueRef vertex_index,
-                                         LLVMValueRef param_index,
-                                         unsigned const_index,
-                                         unsigned location,
-                                         unsigned driver_location,
-                                         unsigned component,
-                                         unsigned num_components,
-                                         bool is_patch,
-                                         bool is_compact,
-                                         bool load_input)
+static LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi, LLVMTypeRef type,
+                                          LLVMValueRef vertex_index, LLVMValueRef param_index,
+                                          unsigned const_index, unsigned location,
+                                          unsigned driver_location, unsigned component,
+                                          unsigned num_components, bool is_patch, bool is_compact,
+                                          bool load_input)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct si_shader_info *info = &ctx->shader->selector->info;
-       LLVMValueRef base, addr;
-
-       driver_location = driver_location / 4;
-       ubyte name = info->input_semantic_name[driver_location];
-       ubyte index = info->input_semantic_index[driver_location];
-
-       assert((name == TGSI_SEMANTIC_PATCH ||
-               name == TGSI_SEMANTIC_TESSINNER ||
-               name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
-
-       base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
-
-       if (!param_index) {
-               param_index = LLVMConstInt(ctx->ac.i32, const_index, 0);
-       }
-
-       addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
-                                                              param_index,
-                                                              name, index);
-
-       /* TODO: This will generate rather ordinary llvm code, although it
-        * should be easy for the optimiser to fix up. In future we might want
-        * to refactor buffer_load().
-        */
-       LLVMValueRef value[4];
-       for (unsigned i = 0; i < num_components; i++) {
-               unsigned offset = i;
-               if (ac_get_type_size(type) == 8) {
-                       offset *= 2;
-                       if (offset == 4) {
-                               ubyte name = info->input_semantic_name[driver_location + 1];
-                               ubyte index = info->input_semantic_index[driver_location + 1];
-                                addr = get_tcs_tes_buffer_address_from_generic_indices(ctx,
-                                                                                       vertex_index,
-                                                                                       param_index,
-                                                                                      name, index);
-                       }
-
-                        offset = offset % 4;
-               }
-
-               offset += component;
-               value[i + component] = buffer_load(ctx, type, offset,
-                                                  ctx->tess_offchip_ring, base, addr, true);
-       }
-
-       return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader_info *info = &ctx->shader->selector->info;
+   LLVMValueRef base, addr;
+
+   driver_location = driver_location / 4;
+   ubyte name = info->input_semantic_name[driver_location];
+   ubyte index = info->input_semantic_index[driver_location];
+
+   assert((name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER ||
+           name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
+
+   base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
+
+   if (!param_index) {
+      param_index = LLVMConstInt(ctx->ac.i32, const_index, 0);
+   }
+
+   addr =
+      get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index, name, index);
+
+   /* TODO: This will generate rather ordinary llvm code, although it
+    * should be easy for the optimiser to fix up. In future we might want
+    * to refactor buffer_load().
+    */
+   LLVMValueRef value[4];
+   for (unsigned i = 0; i < num_components; i++) {
+      unsigned offset = i;
+      if (ac_get_type_size(type) == 8) {
+         offset *= 2;
+         if (offset == 4) {
+            ubyte name = info->input_semantic_name[driver_location + 1];
+            ubyte index = info->input_semantic_index[driver_location + 1];
+            addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index,
+                                                                   name, index);
+         }
+
+         offset = offset % 4;
+      }
+
+      offset += component;
+      value[i + component] =
+         buffer_load(ctx, type, offset, ctx->tess_offchip_ring, base, addr, true);
+   }
+
+   return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
 }
 
-static void si_nir_store_output_tcs(struct ac_shader_abi *abi,
-                                   const struct nir_variable *var,
-                                   LLVMValueRef vertex_index,
-                                   LLVMValueRef param_index,
-                                   unsigned const_index,
-                                   LLVMValueRef src,
-                                   unsigned writemask)
+static void si_nir_store_output_tcs(struct ac_shader_abi *abi, const struct nir_variable *var,
+                                    LLVMValueRef vertex_index, LLVMValueRef param_index,
+                                    unsigned const_index, LLVMValueRef src, unsigned writemask)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct si_shader_info *info = &ctx->shader->selector->info;
-       const unsigned component = var->data.location_frac;
-       unsigned driver_location = var->data.driver_location;
-       LLVMValueRef dw_addr, stride;
-       LLVMValueRef buffer, base, addr;
-       LLVMValueRef values[8];
-       bool skip_lds_store;
-       bool is_tess_factor = false, is_tess_inner = false;
-
-       driver_location = driver_location / 4;
-       ubyte name = info->output_semantic_name[driver_location];
-       ubyte index = info->output_semantic_index[driver_location];
-
-       bool is_const = !param_index;
-       if (!param_index)
-               param_index = LLVMConstInt(ctx->ac.i32, const_index, 0);
-
-       const bool is_patch = var->data.patch ||
-                             var->data.location == VARYING_SLOT_TESS_LEVEL_INNER ||
-                             var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER;
-
-       /* Invalid SPIR-V can cause this. */
-       if ((name == TGSI_SEMANTIC_PATCH ||
-            name == TGSI_SEMANTIC_TESSINNER ||
-            name == TGSI_SEMANTIC_TESSOUTER) != is_patch)
-               return;
-
-       if (!is_patch) {
-               stride = get_tcs_out_vertex_dw_stride(ctx);
-               dw_addr = get_tcs_out_current_patch_offset(ctx);
-               dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr,
-                                                             vertex_index, param_index,
-                                                             name, index);
-
-               skip_lds_store = !info->reads_pervertex_outputs;
-       } else {
-               dw_addr = get_tcs_out_current_patch_data_offset(ctx);
-               dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr,
-                                                             vertex_index, param_index,
-                                                             name, index);
-
-               skip_lds_store = !info->reads_perpatch_outputs;
-
-               if (is_const && const_index == 0) {
-                       int name = info->output_semantic_name[driver_location];
-
-                       /* Always write tess factors into LDS for the TCS epilog. */
-                       if (name == TGSI_SEMANTIC_TESSINNER ||
-                           name == TGSI_SEMANTIC_TESSOUTER) {
-                               /* The epilog doesn't read LDS if invocation 0 defines tess factors. */
-                               skip_lds_store = !info->reads_tessfactor_outputs &&
-                                                ctx->shader->selector->info.tessfactors_are_def_in_all_invocs;
-                               is_tess_factor = true;
-                               is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
-                       }
-               }
-       }
-
-       buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
-
-       base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
-
-       addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
-                                                              param_index, name, index);
-
-       for (unsigned chan = component; chan < 8; chan++) {
-               if (!(writemask & (1 << chan)))
-                       continue;
-               LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
-
-               unsigned buffer_store_offset = chan % 4;
-               if (chan == 4) {
-                       ubyte name = info->output_semantic_name[driver_location + 1];
-                       ubyte index = info->output_semantic_index[driver_location + 1];
-                        addr = get_tcs_tes_buffer_address_from_generic_indices(ctx,
-                                                                               vertex_index,
-                                                                               param_index,
-                                                                              name, index);
-               }
-
-               /* Skip LDS stores if there is no LDS read of this output. */
-               if (!skip_lds_store)
-                       lshs_lds_store(ctx, chan, dw_addr, value);
-
-               value = ac_to_integer(&ctx->ac, value);
-               values[chan] = value;
-
-               if (writemask != 0xF && !is_tess_factor) {
-                       ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
-                                                   addr, base,
-                                                   4 * buffer_store_offset,
-                                                    ac_glc);
-               }
-
-               /* Write tess factors into VGPRs for the epilog. */
-               if (is_tess_factor &&
-                   ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) {
-                       if (!is_tess_inner) {
-                               LLVMBuildStore(ctx->ac.builder, value, /* outer */
-                                              ctx->invoc0_tess_factors[chan]);
-                       } else if (chan < 2) {
-                               LLVMBuildStore(ctx->ac.builder, value, /* inner */
-                                              ctx->invoc0_tess_factors[4 + chan]);
-                       }
-               }
-       }
-
-       if (writemask == 0xF && !is_tess_factor) {
-               LLVMValueRef value = ac_build_gather_values(&ctx->ac,
-                                                           values, 4);
-               ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, addr,
-                                           base, 0, ac_glc);
-       }
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader_info *info = &ctx->shader->selector->info;
+   const unsigned component = var->data.location_frac;
+   unsigned driver_location = var->data.driver_location;
+   LLVMValueRef dw_addr, stride;
+   LLVMValueRef buffer, base, addr;
+   LLVMValueRef values[8];
+   bool skip_lds_store;
+   bool is_tess_factor = false, is_tess_inner = false;
+
+   driver_location = driver_location / 4;
+   ubyte name = info->output_semantic_name[driver_location];
+   ubyte index = info->output_semantic_index[driver_location];
+
+   bool is_const = !param_index;
+   if (!param_index)
+      param_index = LLVMConstInt(ctx->ac.i32, const_index, 0);
+
+   const bool is_patch = var->data.patch || var->data.location == VARYING_SLOT_TESS_LEVEL_INNER ||
+                         var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER;
+
+   /* Invalid SPIR-V can cause this. */
+   if ((name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER ||
+        name == TGSI_SEMANTIC_TESSOUTER) != is_patch)
+      return;
+
+   if (!is_patch) {
+      stride = get_tcs_out_vertex_dw_stride(ctx);
+      dw_addr = get_tcs_out_current_patch_offset(ctx);
+      dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, vertex_index, param_index,
+                                                    name, index);
+
+      skip_lds_store = !info->reads_pervertex_outputs;
+   } else {
+      dw_addr = get_tcs_out_current_patch_data_offset(ctx);
+      dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr, vertex_index, param_index,
+                                                    name, index);
+
+      skip_lds_store = !info->reads_perpatch_outputs;
+
+      if (is_const && const_index == 0) {
+         int name = info->output_semantic_name[driver_location];
+
+         /* Always write tess factors into LDS for the TCS epilog. */
+         if (name == TGSI_SEMANTIC_TESSINNER || name == TGSI_SEMANTIC_TESSOUTER) {
+            /* The epilog doesn't read LDS if invocation 0 defines tess factors. */
+            skip_lds_store = !info->reads_tessfactor_outputs &&
+                             ctx->shader->selector->info.tessfactors_are_def_in_all_invocs;
+            is_tess_factor = true;
+            is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
+         }
+      }
+   }
+
+   buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
+
+   base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
+
+   addr =
+      get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index, name, index);
+
+   for (unsigned chan = component; chan < 8; chan++) {
+      if (!(writemask & (1 << chan)))
+         continue;
+      LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
+
+      unsigned buffer_store_offset = chan % 4;
+      if (chan == 4) {
+         ubyte name = info->output_semantic_name[driver_location + 1];
+         ubyte index = info->output_semantic_index[driver_location + 1];
+         addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index,
+                                                                name, index);
+      }
+
+      /* Skip LDS stores if there is no LDS read of this output. */
+      if (!skip_lds_store)
+         lshs_lds_store(ctx, chan, dw_addr, value);
+
+      value = ac_to_integer(&ctx->ac, value);
+      values[chan] = value;
+
+      if (writemask != 0xF && !is_tess_factor) {
+         ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1, addr, base,
+                                     4 * buffer_store_offset, ac_glc);
+      }
+
+      /* Write tess factors into VGPRs for the epilog. */
+      if (is_tess_factor && ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) {
+         if (!is_tess_inner) {
+            LLVMBuildStore(ctx->ac.builder, value, /* outer */
+                           ctx->invoc0_tess_factors[chan]);
+         } else if (chan < 2) {
+            LLVMBuildStore(ctx->ac.builder, value, /* inner */
+                           ctx->invoc0_tess_factors[4 + chan]);
+         }
+      }
+   }
+
+   if (writemask == 0xF && !is_tess_factor) {
+      LLVMValueRef value = ac_build_gather_values(&ctx->ac, values, 4);
+      ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, addr, base, 0, ac_glc);
+   }
 }
 
 static LLVMValueRef si_load_tess_coord(struct ac_shader_abi *abi)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       LLVMValueRef coord[4] = {
-               ac_get_arg(&ctx->ac, ctx->tes_u),
-               ac_get_arg(&ctx->ac, ctx->tes_v),
-               ctx->ac.f32_0,
-               ctx->ac.f32_0
-       };
-
-       /* For triangles, the vector should be (u, v, 1-u-v). */
-       if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
-           PIPE_PRIM_TRIANGLES) {
-               coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1,
-                                        LLVMBuildFAdd(ctx->ac.builder,
-                                                      coord[0], coord[1], ""), "");
-       }
-       return ac_build_gather_values(&ctx->ac, coord, 4);
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   LLVMValueRef coord[4] = {ac_get_arg(&ctx->ac, ctx->tes_u), ac_get_arg(&ctx->ac, ctx->tes_v),
+                            ctx->ac.f32_0, ctx->ac.f32_0};
+
+   /* For triangles, the vector should be (u, v, 1-u-v). */
+   if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_TRIANGLES) {
+      coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1,
+                               LLVMBuildFAdd(ctx->ac.builder, coord[0], coord[1], ""), "");
+   }
+   return ac_build_gather_values(&ctx->ac, coord, 4);
 }
 
-static LLVMValueRef load_tess_level(struct si_shader_context *ctx,
-                                   unsigned semantic_name)
+static LLVMValueRef load_tess_level(struct si_shader_context *ctx, unsigned semantic_name)
 {
-       LLVMValueRef base, addr;
-
-       int param = si_shader_io_get_unique_index_patch(semantic_name, 0);
+   LLVMValueRef base, addr;
 
-       base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
-       addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
-                                         LLVMConstInt(ctx->ac.i32, param, 0));
+   int param = si_shader_io_get_unique_index_patch(semantic_name, 0);
 
-       return buffer_load(ctx, ctx->ac.f32,
-                          ~0, ctx->tess_offchip_ring, base, addr, true);
+   base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
+   addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
+                                     LLVMConstInt(ctx->ac.i32, param, 0));
 
+   return buffer_load(ctx, ctx->ac.f32, ~0, ctx->tess_offchip_ring, base, addr, true);
 }
 
-static LLVMValueRef load_tess_level_default(struct si_shader_context *ctx,
-                                           unsigned semantic_name)
+static LLVMValueRef load_tess_level_default(struct si_shader_context *ctx, unsigned semantic_name)
 {
-       LLVMValueRef buf, slot, val[4];
-       int i, offset;
-
-       slot = LLVMConstInt(ctx->ac.i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
-       buf = ac_get_arg(&ctx->ac, ctx->rw_buffers);
-       buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot);
-       offset = semantic_name == TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL ? 4 : 0;
-
-       for (i = 0; i < 4; i++)
-               val[i] = si_buffer_load_const(ctx, buf,
-                                             LLVMConstInt(ctx->ac.i32, (offset + i) * 4, 0));
-       return ac_build_gather_values(&ctx->ac, val, 4);
+   LLVMValueRef buf, slot, val[4];
+   int i, offset;
+
+   slot = LLVMConstInt(ctx->ac.i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
+   buf = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+   buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot);
+   offset = semantic_name == TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL ? 4 : 0;
+
+   for (i = 0; i < 4; i++)
+      val[i] = si_buffer_load_const(ctx, buf, LLVMConstInt(ctx->ac.i32, (offset + i) * 4, 0));
+   return ac_build_gather_values(&ctx->ac, val, 4);
 }
 
-static LLVMValueRef si_load_tess_level(struct ac_shader_abi *abi,
-                                      unsigned varying_id,
-                                      bool load_default_state)
+static LLVMValueRef si_load_tess_level(struct ac_shader_abi *abi, unsigned varying_id,
+                                       bool load_default_state)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       unsigned semantic_name;
-
-       if (load_default_state) {
-               switch (varying_id) {
-               case VARYING_SLOT_TESS_LEVEL_INNER:
-                       semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL;
-                       break;
-               case VARYING_SLOT_TESS_LEVEL_OUTER:
-                       semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL;
-                       break;
-               default:
-                       unreachable("unknown tess level");
-               }
-               return load_tess_level_default(ctx, semantic_name);
-       }
-
-       switch (varying_id) {
-       case VARYING_SLOT_TESS_LEVEL_INNER:
-               semantic_name = TGSI_SEMANTIC_TESSINNER;
-               break;
-       case VARYING_SLOT_TESS_LEVEL_OUTER:
-               semantic_name = TGSI_SEMANTIC_TESSOUTER;
-               break;
-       default:
-               unreachable("unknown tess level");
-       }
-
-       return load_tess_level(ctx, semantic_name);
-
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   unsigned semantic_name;
+
+   if (load_default_state) {
+      switch (varying_id) {
+      case VARYING_SLOT_TESS_LEVEL_INNER:
+         semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL;
+         break;
+      case VARYING_SLOT_TESS_LEVEL_OUTER:
+         semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL;
+         break;
+      default:
+         unreachable("unknown tess level");
+      }
+      return load_tess_level_default(ctx, semantic_name);
+   }
+
+   switch (varying_id) {
+   case VARYING_SLOT_TESS_LEVEL_INNER:
+      semantic_name = TGSI_SEMANTIC_TESSINNER;
+      break;
+   case VARYING_SLOT_TESS_LEVEL_OUTER:
+      semantic_name = TGSI_SEMANTIC_TESSOUTER;
+      break;
+   default:
+      unreachable("unknown tess level");
+   }
+
+   return load_tess_level(ctx, semantic_name);
 }
 
 static LLVMValueRef si_load_patch_vertices_in(struct ac_shader_abi *abi)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       if (ctx->type == PIPE_SHADER_TESS_CTRL)
-               return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 13, 6);
-       else if (ctx->type == PIPE_SHADER_TESS_EVAL)
-               return get_num_tcs_out_vertices(ctx);
-       else
-               unreachable("invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   if (ctx->type == PIPE_SHADER_TESS_CTRL)
+      return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 13, 6);
+   else if (ctx->type == PIPE_SHADER_TESS_EVAL)
+      return get_num_tcs_out_vertices(ctx);
+   else
+      unreachable("invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
 }
 
 /**
@@ -793,503 +708,450 @@ static LLVMValueRef si_load_patch_vertices_in(struct ac_shader_abi *abi)
  */
 static void si_copy_tcs_inputs(struct si_shader_context *ctx)
 {
-       LLVMValueRef invocation_id, buffer, buffer_offset;
-       LLVMValueRef lds_vertex_stride, lds_base;
-       uint64_t inputs;
+   LLVMValueRef invocation_id, buffer, buffer_offset;
+   LLVMValueRef lds_vertex_stride, lds_base;
+   uint64_t inputs;
 
-       invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5);
-       buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
-       buffer_offset = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
+   invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5);
+   buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
+   buffer_offset = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
 
-       lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx);
-       lds_base = get_tcs_in_current_patch_offset(ctx);
-       lds_base = ac_build_imad(&ctx->ac, invocation_id, lds_vertex_stride,
-                                lds_base);
+   lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx);
+   lds_base = get_tcs_in_current_patch_offset(ctx);
+   lds_base = ac_build_imad(&ctx->ac, invocation_id, lds_vertex_stride, lds_base);
 
-       inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
-       while (inputs) {
-               unsigned i = u_bit_scan64(&inputs);
+   inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
+   while (inputs) {
+      unsigned i = u_bit_scan64(&inputs);
 
-               LLVMValueRef lds_ptr = LLVMBuildAdd(ctx->ac.builder, lds_base,
-                                           LLVMConstInt(ctx->ac.i32, 4 * i, 0),
-                                            "");
+      LLVMValueRef lds_ptr =
+         LLVMBuildAdd(ctx->ac.builder, lds_base, LLVMConstInt(ctx->ac.i32, 4 * i, 0), "");
 
-               LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
-                                             get_rel_patch_id(ctx),
-                                             invocation_id,
-                                             LLVMConstInt(ctx->ac.i32, i, 0));
+      LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(
+         ctx, get_rel_patch_id(ctx), invocation_id, LLVMConstInt(ctx->ac.i32, i, 0));
 
-               LLVMValueRef value = lshs_lds_load(ctx, ctx->ac.i32, ~0, lds_ptr);
+      LLVMValueRef value = lshs_lds_load(ctx, ctx->ac.i32, ~0, lds_ptr);
 
-               ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
-                                           buffer_offset, 0, ac_glc);
-       }
+      ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr, buffer_offset, 0,
+                                  ac_glc);
+   }
 }
 
-static void si_write_tess_factors(struct si_shader_context *ctx,
-                                 LLVMValueRef rel_patch_id,
-                                 LLVMValueRef invocation_id,
-                                 LLVMValueRef tcs_out_current_patch_data_offset,
-                                 LLVMValueRef invoc0_tf_outer[4],
-                                 LLVMValueRef invoc0_tf_inner[2])
+static void si_write_tess_factors(struct si_shader_context *ctx, LLVMValueRef rel_patch_id,
+                                  LLVMValueRef invocation_id,
+                                  LLVMValueRef tcs_out_current_patch_data_offset,
+                                  LLVMValueRef invoc0_tf_outer[4], LLVMValueRef invoc0_tf_inner[2])
 {
-       struct si_shader *shader = ctx->shader;
-       unsigned tess_inner_index, tess_outer_index;
-       LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
-       LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
-       unsigned stride, outer_comps, inner_comps, i, offset;
-
-       /* Add a barrier before loading tess factors from LDS. */
-       if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def)
-               si_llvm_emit_barrier(ctx);
-
-       /* Do this only for invocation 0, because the tess levels are per-patch,
-        * not per-vertex.
-        *
-        * This can't jump, because invocation 0 executes this. It should
-        * at least mask out the loads and stores for other invocations.
-        */
-       ac_build_ifcc(&ctx->ac,
-                     LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
-                                   invocation_id, ctx->ac.i32_0, ""), 6503);
-
-       /* Determine the layout of one tess factor element in the buffer. */
-       switch (shader->key.part.tcs.epilog.prim_mode) {
-       case PIPE_PRIM_LINES:
-               stride = 2; /* 2 dwords, 1 vec2 store */
-               outer_comps = 2;
-               inner_comps = 0;
-               break;
-       case PIPE_PRIM_TRIANGLES:
-               stride = 4; /* 4 dwords, 1 vec4 store */
-               outer_comps = 3;
-               inner_comps = 1;
-               break;
-       case PIPE_PRIM_QUADS:
-               stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
-               outer_comps = 4;
-               inner_comps = 2;
-               break;
-       default:
-               assert(0);
-               return;
-       }
-
-       for (i = 0; i < 4; i++) {
-               inner[i] = LLVMGetUndef(ctx->ac.i32);
-               outer[i] = LLVMGetUndef(ctx->ac.i32);
-       }
-
-       if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) {
-               /* Tess factors are in VGPRs. */
-               for (i = 0; i < outer_comps; i++)
-                       outer[i] = out[i] = invoc0_tf_outer[i];
-               for (i = 0; i < inner_comps; i++)
-                       inner[i] = out[outer_comps+i] = invoc0_tf_inner[i];
-       } else {
-               /* Load tess_inner and tess_outer from LDS.
-                * Any invocation can write them, so we can't get them from a temporary.
-                */
-               tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
-               tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
-
-               lds_base = tcs_out_current_patch_data_offset;
-               lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base,
-                                        LLVMConstInt(ctx->ac.i32,
-                                                     tess_inner_index * 4, 0), "");
-               lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base,
-                                        LLVMConstInt(ctx->ac.i32,
-                                                     tess_outer_index * 4, 0), "");
-
-               for (i = 0; i < outer_comps; i++) {
-                       outer[i] = out[i] =
-                               lshs_lds_load(ctx, ctx->ac.i32, i, lds_outer);
-               }
-               for (i = 0; i < inner_comps; i++) {
-                       inner[i] = out[outer_comps+i] =
-                               lshs_lds_load(ctx, ctx->ac.i32, i, lds_inner);
-               }
-       }
-
-       if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
-               /* For isolines, the hardware expects tess factors in the
-                * reverse order from what NIR specifies.
-                */
-               LLVMValueRef tmp = out[0];
-               out[0] = out[1];
-               out[1] = tmp;
-       }
-
-       /* Convert the outputs to vectors for stores. */
-       vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4));
-       vec1 = NULL;
-
-       if (stride > 4)
-               vec1 = ac_build_gather_values(&ctx->ac, out+4, stride - 4);
-
-       /* Get the buffer. */
-       buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING);
-
-       /* Get the offset. */
-       tf_base = ac_get_arg(&ctx->ac,
-                            ctx->tcs_factor_offset);
-       byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id,
-                                 LLVMConstInt(ctx->ac.i32, 4 * stride, 0), "");
-
-       ac_build_ifcc(&ctx->ac,
-                     LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
-                                   rel_patch_id, ctx->ac.i32_0, ""), 6504);
-
-       /* Store the dynamic HS control word. */
-       offset = 0;
-       if (ctx->screen->info.chip_class <= GFX8) {
-               ac_build_buffer_store_dword(&ctx->ac, buffer,
-                                           LLVMConstInt(ctx->ac.i32, 0x80000000, 0),
-                                           1, ctx->ac.i32_0, tf_base,
-                                           offset, ac_glc);
-               offset += 4;
-       }
-
-       ac_build_endif(&ctx->ac, 6504);
-
-       /* Store the tessellation factors. */
-       ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
-                                   MIN2(stride, 4), byteoffset, tf_base,
-                                   offset, ac_glc);
-       offset += 16;
-       if (vec1)
-               ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
-                                           stride - 4, byteoffset, tf_base,
-                                           offset, ac_glc);
-
-       /* Store the tess factors into the offchip buffer if TES reads them. */
-       if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
-               LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
-               LLVMValueRef tf_inner_offset;
-               unsigned param_outer, param_inner;
-
-               buf = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
-               base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
-
-               param_outer = si_shader_io_get_unique_index_patch(
-                                     TGSI_SEMANTIC_TESSOUTER, 0);
-               tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
-                                       LLVMConstInt(ctx->ac.i32, param_outer, 0));
-
-               unsigned outer_vec_size =
-                       ac_has_vec3_support(ctx->screen->info.chip_class, false) ?
-                               outer_comps : util_next_power_of_two(outer_comps);
-               outer_vec = ac_build_gather_values(&ctx->ac, outer, outer_vec_size);
-
-               ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
-                                           outer_comps, tf_outer_offset,
-                                           base, 0, ac_glc);
-               if (inner_comps) {
-                       param_inner = si_shader_io_get_unique_index_patch(
-                                             TGSI_SEMANTIC_TESSINNER, 0);
-                       tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
-                                       LLVMConstInt(ctx->ac.i32, param_inner, 0));
-
-                       inner_vec = inner_comps == 1 ? inner[0] :
-                                   ac_build_gather_values(&ctx->ac, inner, inner_comps);
-                       ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
-                                                   inner_comps, tf_inner_offset,
-                                                   base, 0, ac_glc);
-               }
-       }
-
-       ac_build_endif(&ctx->ac, 6503);
+   struct si_shader *shader = ctx->shader;
+   unsigned tess_inner_index, tess_outer_index;
+   LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
+   LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
+   unsigned stride, outer_comps, inner_comps, i, offset;
+
+   /* Add a barrier before loading tess factors from LDS. */
+   if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def)
+      si_llvm_emit_barrier(ctx);
+
+   /* Do this only for invocation 0, because the tess levels are per-patch,
+    * not per-vertex.
+    *
+    * This can't jump, because invocation 0 executes this. It should
+    * at least mask out the loads and stores for other invocations.
+    */
+   ac_build_ifcc(&ctx->ac,
+                 LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, invocation_id, ctx->ac.i32_0, ""), 6503);
+
+   /* Determine the layout of one tess factor element in the buffer. */
+   switch (shader->key.part.tcs.epilog.prim_mode) {
+   case PIPE_PRIM_LINES:
+      stride = 2; /* 2 dwords, 1 vec2 store */
+      outer_comps = 2;
+      inner_comps = 0;
+      break;
+   case PIPE_PRIM_TRIANGLES:
+      stride = 4; /* 4 dwords, 1 vec4 store */
+      outer_comps = 3;
+      inner_comps = 1;
+      break;
+   case PIPE_PRIM_QUADS:
+      stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
+      outer_comps = 4;
+      inner_comps = 2;
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   for (i = 0; i < 4; i++) {
+      inner[i] = LLVMGetUndef(ctx->ac.i32);
+      outer[i] = LLVMGetUndef(ctx->ac.i32);
+   }
+
+   if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) {
+      /* Tess factors are in VGPRs. */
+      for (i = 0; i < outer_comps; i++)
+         outer[i] = out[i] = invoc0_tf_outer[i];
+      for (i = 0; i < inner_comps; i++)
+         inner[i] = out[outer_comps + i] = invoc0_tf_inner[i];
+   } else {
+      /* Load tess_inner and tess_outer from LDS.
+       * Any invocation can write them, so we can't get them from a temporary.
+       */
+      tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
+      tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
+
+      lds_base = tcs_out_current_patch_data_offset;
+      lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base,
+                               LLVMConstInt(ctx->ac.i32, tess_inner_index * 4, 0), "");
+      lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base,
+                               LLVMConstInt(ctx->ac.i32, tess_outer_index * 4, 0), "");
+
+      for (i = 0; i < outer_comps; i++) {
+         outer[i] = out[i] = lshs_lds_load(ctx, ctx->ac.i32, i, lds_outer);
+      }
+      for (i = 0; i < inner_comps; i++) {
+         inner[i] = out[outer_comps + i] = lshs_lds_load(ctx, ctx->ac.i32, i, lds_inner);
+      }
+   }
+
+   if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
+      /* For isolines, the hardware expects tess factors in the
+       * reverse order from what NIR specifies.
+       */
+      LLVMValueRef tmp = out[0];
+      out[0] = out[1];
+      out[1] = tmp;
+   }
+
+   /* Convert the outputs to vectors for stores. */
+   vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4));
+   vec1 = NULL;
+
+   if (stride > 4)
+      vec1 = ac_build_gather_values(&ctx->ac, out + 4, stride - 4);
+
+   /* Get the buffer. */
+   buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING);
+
+   /* Get the offset. */
+   tf_base = ac_get_arg(&ctx->ac, ctx->tcs_factor_offset);
+   byteoffset =
+      LLVMBuildMul(ctx->ac.builder, rel_patch_id, LLVMConstInt(ctx->ac.i32, 4 * stride, 0), "");
+
+   ac_build_ifcc(&ctx->ac,
+                 LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, rel_patch_id, ctx->ac.i32_0, ""), 6504);
+
+   /* Store the dynamic HS control word. */
+   offset = 0;
+   if (ctx->screen->info.chip_class <= GFX8) {
+      ac_build_buffer_store_dword(&ctx->ac, buffer, LLVMConstInt(ctx->ac.i32, 0x80000000, 0), 1,
+                                  ctx->ac.i32_0, tf_base, offset, ac_glc);
+      offset += 4;
+   }
+
+   ac_build_endif(&ctx->ac, 6504);
+
+   /* Store the tessellation factors. */
+   ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, MIN2(stride, 4), byteoffset, tf_base, offset,
+                               ac_glc);
+   offset += 16;
+   if (vec1)
+      ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, stride - 4, byteoffset, tf_base, offset,
+                                  ac_glc);
+
+   /* Store the tess factors into the offchip buffer if TES reads them. */
+   if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
+      LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
+      LLVMValueRef tf_inner_offset;
+      unsigned param_outer, param_inner;
+
+      buf = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
+      base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
+
+      param_outer = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
+      tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
+                                                   LLVMConstInt(ctx->ac.i32, param_outer, 0));
+
+      unsigned outer_vec_size = ac_has_vec3_support(ctx->screen->info.chip_class, false)
+                                   ? outer_comps
+                                   : util_next_power_of_two(outer_comps);
+      outer_vec = ac_build_gather_values(&ctx->ac, outer, outer_vec_size);
+
+      ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec, outer_comps, tf_outer_offset, base, 0,
+                                  ac_glc);
+      if (inner_comps) {
+         param_inner = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
+         tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
+                                                      LLVMConstInt(ctx->ac.i32, param_inner, 0));
+
+         inner_vec =
+            inner_comps == 1 ? inner[0] : ac_build_gather_values(&ctx->ac, inner, inner_comps);
+         ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec, inner_comps, tf_inner_offset, base,
+                                     0, ac_glc);
+      }
+   }
+
+   ac_build_endif(&ctx->ac, 6503);
 }
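
For reference, a minimal standalone sketch of the per-patch layout that the switch above selects: isolines use a 2-dword element (outer[0..1]), triangles 4 dwords (outer[0..2] plus inner[0]), quads 6 dwords (outer[0..3] plus inner[0..1]), and the element of patch rel_patch_id starts at rel_patch_id * 4 * stride bytes relative to tf_base. The enum and helper below are illustrative names only and are not part of this change.

#include <stdio.h>

/* Illustration only: mirrors the stride selection in si_write_tess_factors(). */
enum tf_prim { TF_ISOLINES, TF_TRIANGLES, TF_QUADS };

static unsigned tf_element_stride_dw(enum tf_prim prim)
{
   switch (prim) {
   case TF_ISOLINES:
      return 2; /* outer[0..1], one vec2 store */
   case TF_TRIANGLES:
      return 4; /* outer[0..2] + inner[0], one vec4 store */
   default:
      return 6; /* quads: outer[0..3] + inner[0..1], vec4 + vec2 stores */
   }
}

int main(void)
{
   unsigned rel_patch_id = 3;
   /* Same computation as the "byteoffset" value above. */
   printf("quad patch %u: byteoffset = %u\n", rel_patch_id,
          rel_patch_id * 4 * tf_element_stride_dw(TF_QUADS));
   return 0;
}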
 
 /* This only writes the tessellation factor levels. */
-static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi,
-                                     unsigned max_outputs,
-                                     LLVMValueRef *addrs)
+static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
+                                      LLVMValueRef *addrs)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
-
-       si_copy_tcs_inputs(ctx);
-
-       rel_patch_id = get_rel_patch_id(ctx);
-       invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5);
-       tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
-
-       if (ctx->screen->info.chip_class >= GFX9) {
-               LLVMBasicBlockRef blocks[2] = {
-                       LLVMGetInsertBlock(builder),
-                       ctx->merged_wrap_if_entry_block
-               };
-               LLVMValueRef values[2];
-
-               ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
-
-               values[0] = rel_patch_id;
-               values[1] = LLVMGetUndef(ctx->ac.i32);
-               rel_patch_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
-
-               values[0] = tf_lds_offset;
-               values[1] = LLVMGetUndef(ctx->ac.i32);
-               tf_lds_offset = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
-
-               values[0] = invocation_id;
-               values[1] = ctx->ac.i32_1; /* cause the epilog to skip threads */
-               invocation_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
-       }
-
-       /* Return epilog parameters from this function. */
-       LLVMValueRef ret = ctx->return_value;
-       unsigned vgpr;
-
-       if (ctx->screen->info.chip_class >= GFX9) {
-               ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout,
-                                         8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
-               ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout,
-                                         8 + GFX9_SGPR_TCS_OUT_LAYOUT);
-               /* Tess offchip and tess factor offsets are at the beginning. */
-               ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2);
-               ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4);
-               vgpr = 8 + GFX9_SGPR_TCS_OUT_LAYOUT + 1;
-       } else {
-               ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout,
-                                         GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
-               ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout,
-                                         GFX6_SGPR_TCS_OUT_LAYOUT);
-               /* Tess offchip and tess factor offsets are after user SGPRs. */
-               ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset,
-                                         GFX6_TCS_NUM_USER_SGPR);
-               ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset,
-                                         GFX6_TCS_NUM_USER_SGPR + 1);
-               vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
-       }
-
-       /* VGPRs */
-       rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id);
-       invocation_id = ac_to_float(&ctx->ac, invocation_id);
-       tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset);
-
-       /* Leave a hole corresponding to the two input VGPRs. This ensures that
-        * the invocation_id output does not alias the tcs_rel_ids input,
-        * which saves a V_MOV on gfx9.
-        */
-       vgpr += 2;
-
-       ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
-       ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
-
-       if (ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) {
-               vgpr++; /* skip the tess factor LDS offset */
-               for (unsigned i = 0; i < 6; i++) {
-                       LLVMValueRef value =
-                               LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], "");
-                       value = ac_to_float(&ctx->ac, value);
-                       ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, "");
-               }
-       } else {
-               ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
-       }
-       ctx->return_value = ret;
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
+
+   si_copy_tcs_inputs(ctx);
+
+   rel_patch_id = get_rel_patch_id(ctx);
+   invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5);
+   tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
+
+   if (ctx->screen->info.chip_class >= GFX9) {
+      LLVMBasicBlockRef blocks[2] = {LLVMGetInsertBlock(builder), ctx->merged_wrap_if_entry_block};
+      LLVMValueRef values[2];
+
+      ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
+
+      values[0] = rel_patch_id;
+      values[1] = LLVMGetUndef(ctx->ac.i32);
+      rel_patch_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
+
+      values[0] = tf_lds_offset;
+      values[1] = LLVMGetUndef(ctx->ac.i32);
+      tf_lds_offset = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
+
+      values[0] = invocation_id;
+      values[1] = ctx->ac.i32_1; /* cause the epilog to skip threads */
+      invocation_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
+   }
+
+   /* Return epilog parameters from this function. */
+   LLVMValueRef ret = ctx->return_value;
+   unsigned vgpr;
+
+   if (ctx->screen->info.chip_class >= GFX9) {
+      ret =
+         si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
+      ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
+      /* Tess offchip and tess factor offsets are at the beginning. */
+      ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2);
+      ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4);
+      vgpr = 8 + GFX9_SGPR_TCS_OUT_LAYOUT + 1;
+   } else {
+      ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
+      ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, GFX6_SGPR_TCS_OUT_LAYOUT);
+      /* Tess offchip and tess factor offsets are after user SGPRs. */
+      ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, GFX6_TCS_NUM_USER_SGPR);
+      ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, GFX6_TCS_NUM_USER_SGPR + 1);
+      vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
+   }
+
+   /* VGPRs */
+   rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id);
+   invocation_id = ac_to_float(&ctx->ac, invocation_id);
+   tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset);
+
+   /* Leave a hole corresponding to the two input VGPRs. This ensures that
+    * the invocation_id output does not alias the tcs_rel_ids input,
+    * which saves a V_MOV on gfx9.
+    */
+   vgpr += 2;
+
+   ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
+   ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
+
+   if (ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) {
+      vgpr++; /* skip the tess factor LDS offset */
+      for (unsigned i = 0; i < 6; i++) {
+         LLVMValueRef value = LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], "");
+         value = ac_to_float(&ctx->ac, value);
+         ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, "");
+      }
+   } else {
+      ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
+   }
+   ctx->return_value = ret;
 }
 
 /* Pass TCS inputs from LS to TCS on GFX9. */
 static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
 {
-       LLVMValueRef ret = ctx->return_value;
-
-       ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
-       ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
-       ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2);
-       ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
-       ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4);
-       ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);
-
-       ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers,
-                                 8 + SI_SGPR_RW_BUFFERS);
-       ret = si_insert_input_ptr(ctx, ret,
-                                 ctx->bindless_samplers_and_images,
-                                 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
-
-       ret = si_insert_input_ret(ctx, ret, ctx->vs_state_bits,
-                                 8 + SI_SGPR_VS_STATE_BITS);
-
-       ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout,
-                                 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
-       ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_offsets,
-                                 8 + GFX9_SGPR_TCS_OUT_OFFSETS);
-       ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout,
-                                 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
-
-       unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
-       ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
-                                  ac_to_float(&ctx->ac,
-                                              ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id)),
-                                  vgpr++, "");
-       ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
-                                  ac_to_float(&ctx->ac,
-                                              ac_get_arg(&ctx->ac, ctx->args.tcs_rel_ids)),
-                                  vgpr++, "");
-       ctx->return_value = ret;
+   LLVMValueRef ret = ctx->return_value;
+
+   ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
+   ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
+   ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2);
+   ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
+   ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4);
+   ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);
+
+   ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, 8 + SI_SGPR_RW_BUFFERS);
+   ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
+                             8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
+
+   ret = si_insert_input_ret(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
+
+   ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
+   ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_offsets, 8 + GFX9_SGPR_TCS_OUT_OFFSETS);
+   ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
+
+   unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
+   ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
+                              ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id)),
+                              vgpr++, "");
+   ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
+                              ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.tcs_rel_ids)),
+                              vgpr++, "");
+   ctx->return_value = ret;
 }
 
-void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
-                             LLVMValueRef *addrs)
+void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct si_shader *shader = ctx->shader;
-       struct si_shader_info *info = &shader->selector->info;
-       unsigned i, chan;
-       LLVMValueRef vertex_id = ac_get_arg(&ctx->ac, ctx->rel_auto_id);
-       LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx);
-       LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id,
-                                                vertex_dw_stride, "");
-
-       /* Write outputs to LDS. The next shader (TCS aka HS) will read
-        * its inputs from it. */
-       for (i = 0; i < info->num_outputs; i++) {
-               unsigned name = info->output_semantic_name[i];
-               unsigned index = info->output_semantic_index[i];
-
-               /* The ARB_shader_viewport_layer_array spec contains the
-                * following issue:
-                *
-                *    2) What happens if gl_ViewportIndex or gl_Layer is
-                *    written in the vertex shader and a geometry shader is
-                *    present?
-                *
-                *    RESOLVED: The value written by the last vertex processing
-                *    stage is used. If the last vertex processing stage
-                *    (vertex, tessellation evaluation or geometry) does not
-                *    statically assign to gl_ViewportIndex or gl_Layer, index
-                *    or layer zero is assumed.
-                *
-                * So writes to those outputs in VS-as-LS are simply ignored.
-                */
-               if (name == TGSI_SEMANTIC_LAYER ||
-                   name == TGSI_SEMANTIC_VIEWPORT_INDEX)
-                       continue;
-
-               int param = si_shader_io_get_unique_index(name, index, false);
-               LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr,
-                                       LLVMConstInt(ctx->ac.i32, param * 4, 0), "");
-
-               for (chan = 0; chan < 4; chan++) {
-                       if (!(info->output_usagemask[i] & (1 << chan)))
-                               continue;
-
-                       lshs_lds_store(ctx, chan, dw_addr,
-                                 LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""));
-               }
-       }
-
-       if (ctx->screen->info.chip_class >= GFX9)
-               si_set_ls_return_value_for_tcs(ctx);
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader *shader = ctx->shader;
+   struct si_shader_info *info = &shader->selector->info;
+   unsigned i, chan;
+   LLVMValueRef vertex_id = ac_get_arg(&ctx->ac, ctx->rel_auto_id);
+   LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx);
+   LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id, vertex_dw_stride, "");
+
+   /* Write outputs to LDS. The next shader (TCS aka HS) will read
+    * its inputs from it. */
+   for (i = 0; i < info->num_outputs; i++) {
+      unsigned name = info->output_semantic_name[i];
+      unsigned index = info->output_semantic_index[i];
+
+      /* The ARB_shader_viewport_layer_array spec contains the
+       * following issue:
+       *
+       *    2) What happens if gl_ViewportIndex or gl_Layer is
+       *    written in the vertex shader and a geometry shader is
+       *    present?
+       *
+       *    RESOLVED: The value written by the last vertex processing
+       *    stage is used. If the last vertex processing stage
+       *    (vertex, tessellation evaluation or geometry) does not
+       *    statically assign to gl_ViewportIndex or gl_Layer, index
+       *    or layer zero is assumed.
+       *
+       * So writes to those outputs in VS-as-LS are simply ignored.
+       */
+      if (name == TGSI_SEMANTIC_LAYER || name == TGSI_SEMANTIC_VIEWPORT_INDEX)
+         continue;
+
+      int param = si_shader_io_get_unique_index(name, index, false);
+      LLVMValueRef dw_addr =
+         LLVMBuildAdd(ctx->ac.builder, base_dw_addr, LLVMConstInt(ctx->ac.i32, param * 4, 0), "");
+
+      for (chan = 0; chan < 4; chan++) {
+         if (!(info->output_usagemask[i] & (1 << chan)))
+            continue;
+
+         lshs_lds_store(ctx, chan, dw_addr,
+                        LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""));
+      }
+   }
+
+   if (ctx->screen->info.chip_class >= GFX9)
+      si_set_ls_return_value_for_tcs(ctx);
 }
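
As a rough illustration of the LDS addressing above: each LS output slot occupies four dwords per vertex, so a given channel of output slot "param" for a vertex lives at vertex_id * vertex_dw_stride + param * 4 plus the channel index (this assumes lshs_lds_store(), defined elsewhere, adds the channel offset). The helper below is hypothetical and only restates that arithmetic.

#include <assert.h>

/* Hypothetical helper, illustration only: LDS dword address of one channel of
 * one LS output, as addressed in si_llvm_emit_ls_epilogue(); the per-channel
 * add is assumed to happen inside lshs_lds_store(). */
static unsigned ls_output_lds_dw_addr(unsigned vertex_id, unsigned vertex_dw_stride,
                                      unsigned param, unsigned chan)
{
   return vertex_id * vertex_dw_stride + param * 4 + chan;
}

int main(void)
{
   /* Vertex 2, 16 output dwords per vertex, output slot 1, channel z. */
   assert(ls_output_lds_dw_addr(2, 16, 1, 2) == 38);
   return 0;
}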
 
 /**
  * Compile the TCS epilog function. This writes tessellation factors to memory
  * based on the output primitive type of the tessellator (determined by TES).
  */
-void si_llvm_build_tcs_epilog(struct si_shader_context *ctx,
-                             union si_shader_part_key *key)
+void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_part_key *key)
 {
-       memset(&ctx->args, 0, sizeof(ctx->args));
-
-       if (ctx->screen->info.chip_class >= GFX9) {
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                          &ctx->tcs_offchip_offset);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* wave info */
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                          &ctx->tcs_factor_offset);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                          &ctx->tcs_offchip_layout);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                          &ctx->tcs_out_lds_layout);
-       } else {
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                          &ctx->tcs_offchip_layout);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                          &ctx->tcs_out_lds_layout);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                          &ctx->tcs_offchip_offset);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                          &ctx->tcs_factor_offset);
-       }
-
-       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
-       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
-       struct ac_arg rel_patch_id; /* patch index within the wave (REL_PATCH_ID) */
-       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &rel_patch_id);
-       struct ac_arg invocation_id; /* invocation ID within the patch */
-       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &invocation_id);
-       struct ac_arg tcs_out_current_patch_data_offset; /* LDS offset where tess factors should be loaded from */
-       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
-                  &tcs_out_current_patch_data_offset);
-
-       struct ac_arg tess_factors[6];
-       for (unsigned i = 0; i < 6; i++)
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &tess_factors[i]);
-
-       /* Create the function. */
-       si_llvm_create_func(ctx, "tcs_epilog", NULL, 0,
-                           ctx->screen->info.chip_class >= GFX7 ? 128 : 0);
-       ac_declare_lds_as_pointer(&ctx->ac);
-
-       LLVMValueRef invoc0_tess_factors[6];
-       for (unsigned i = 0; i < 6; i++)
-               invoc0_tess_factors[i] = ac_get_arg(&ctx->ac, tess_factors[i]);
-
-       si_write_tess_factors(ctx,
-                             ac_get_arg(&ctx->ac, rel_patch_id),
-                             ac_get_arg(&ctx->ac, invocation_id),
-                             ac_get_arg(&ctx->ac, tcs_out_current_patch_data_offset),
-                             invoc0_tess_factors, invoc0_tess_factors + 4);
-
-       LLVMBuildRetVoid(ctx->ac.builder);
+   memset(&ctx->args, 0, sizeof(ctx->args));
+
+   if (ctx->screen->info.chip_class >= GFX9) {
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* wave info */
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
+   } else {
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
+   }
+
+   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
+   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
+   struct ac_arg rel_patch_id; /* patch index within the wave (REL_PATCH_ID) */
+   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &rel_patch_id);
+   struct ac_arg invocation_id; /* invocation ID within the patch */
+   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &invocation_id);
+   struct ac_arg
+      tcs_out_current_patch_data_offset; /* LDS offset where tess factors should be loaded from */
+   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &tcs_out_current_patch_data_offset);
+
+   struct ac_arg tess_factors[6];
+   for (unsigned i = 0; i < 6; i++)
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &tess_factors[i]);
+
+   /* Create the function. */
+   si_llvm_create_func(ctx, "tcs_epilog", NULL, 0, ctx->screen->info.chip_class >= GFX7 ? 128 : 0);
+   ac_declare_lds_as_pointer(&ctx->ac);
+
+   LLVMValueRef invoc0_tess_factors[6];
+   for (unsigned i = 0; i < 6; i++)
+      invoc0_tess_factors[i] = ac_get_arg(&ctx->ac, tess_factors[i]);
+
+   si_write_tess_factors(ctx, ac_get_arg(&ctx->ac, rel_patch_id),
+                         ac_get_arg(&ctx->ac, invocation_id),
+                         ac_get_arg(&ctx->ac, tcs_out_current_patch_data_offset),
+                         invoc0_tess_factors, invoc0_tess_factors + 4);
+
+   LLVMBuildRetVoid(ctx->ac.builder);
 }
 
 void si_llvm_init_tcs_callbacks(struct si_shader_context *ctx)
 {
-       ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings;
-       ctx->abi.load_tess_level = si_load_tess_level;
-       ctx->abi.store_tcs_outputs = si_nir_store_output_tcs;
-       ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue;
-       ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
+   ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings;
+   ctx->abi.load_tess_level = si_load_tess_level;
+   ctx->abi.store_tcs_outputs = si_nir_store_output_tcs;
+   ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue;
+   ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
 }
 
 void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader)
 {
-       ctx->abi.load_tess_varyings = si_nir_load_input_tes;
-       ctx->abi.load_tess_coord = si_load_tess_coord;
-       ctx->abi.load_tess_level = si_load_tess_level;
-       ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
-
-       if (ctx->shader->key.as_es)
-               ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
-       else if (ngg_cull_shader)
-               ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32;
-       else if (ctx->shader->key.as_ngg)
-               ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
-       else
-               ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
+   ctx->abi.load_tess_varyings = si_nir_load_input_tes;
+   ctx->abi.load_tess_coord = si_load_tess_coord;
+   ctx->abi.load_tess_level = si_load_tess_level;
+   ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
+
+   if (ctx->shader->key.as_es)
+      ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
+   else if (ngg_cull_shader)
+      ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32;
+   else if (ctx->shader->key.as_ngg)
+      ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
+   else
+      ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
 }
index 39c06f41ecea441b8dc1f0be4e89fa97901d969c..8640150b18c206c5b691d4600a84f84669149532 100644 (file)
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include "si_shader_internal.h"
 #include "si_pipe.h"
+#include "si_shader_internal.h"
 #include "sid.h"
 #include "util/u_memory.h"
 
-static LLVMValueRef unpack_sint16(struct si_shader_context *ctx,
-                                LLVMValueRef i32, unsigned index)
+static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, LLVMValueRef i32, unsigned index)
 {
-       assert(index <= 1);
+   assert(index <= 1);
 
-       if (index == 1)
-               return LLVMBuildAShr(ctx->ac.builder, i32,
-                                    LLVMConstInt(ctx->ac.i32, 16, 0), "");
+   if (index == 1)
+      return LLVMBuildAShr(ctx->ac.builder, i32, LLVMConstInt(ctx->ac.i32, 16, 0), "");
 
-       return LLVMBuildSExt(ctx->ac.builder,
-                            LLVMBuildTrunc(ctx->ac.builder, i32,
-                                           ctx->ac.i16, ""),
-                            ctx->ac.i32, "");
+   return LLVMBuildSExt(ctx->ac.builder, LLVMBuildTrunc(ctx->ac.builder, i32, ctx->ac.i16, ""),
+                        ctx->ac.i32, "");
 }
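
In plain C, the IR built by unpack_sint16() amounts to the following: index 0 sign-extends the low 16 bits, index 1 takes the high 16 bits as a signed value. The sketch below uses a cast chain rather than an arithmetic shift so it stays well-defined C, but it computes the same result; the names are illustrative only.

#include <assert.h>
#include <stdint.h>

/* Illustration only: what unpack_sint16() computes for two signed 16-bit
 * values packed into one 32-bit SGPR. */
static int32_t unpack_sint16_sketch(uint32_t packed, unsigned index)
{
   if (index == 1)
      return (int32_t)(int16_t)(packed >> 16); /* high half, like the AShr by 16 */
   return (int32_t)(int16_t)packed;            /* low half: Trunc to i16, then SExt */
}

int main(void)
{
   /* x = 5 and y = -1 packed into one dword, as the VS blit inputs are. */
   uint32_t packed = 0xffff0005u;
   assert(unpack_sint16_sketch(packed, 0) == 5);
   assert(unpack_sint16_sketch(packed, 1) == -1);
   return 0;
}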
 
-static void load_input_vs(struct si_shader_context *ctx, unsigned input_index,
-                         LLVMValueRef out[4])
+static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, LLVMValueRef out[4])
 {
-       const struct si_shader_info *info = &ctx->shader->selector->info;
-       unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
-
-       if (vs_blit_property) {
-               LLVMValueRef vertex_id = ctx->abi.vertex_id;
-               LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder,
-                                                   LLVMIntULE, vertex_id,
-                                                   ctx->ac.i32_1, "");
-               /* Use LLVMIntNE, because we have 3 vertices and only
-                * the middle one should use y2.
-                */
-               LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder,
-                                                   LLVMIntNE, vertex_id,
-                                                   ctx->ac.i32_1, "");
-
-               unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index;
-               if (input_index == 0) {
-                       /* Position: */
-                       LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn,
-                                                        param_vs_blit_inputs);
-                       LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn,
-                                                        param_vs_blit_inputs + 1);
-
-                       LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
-                       LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
-                       LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
-                       LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);
-
-                       LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1,
-                                                        x1, x2, "");
-                       LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1,
-                                                        y1, y2, "");
-
-                       out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, "");
-                       out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, "");
-                       out[2] = LLVMGetParam(ctx->main_fn,
-                                             param_vs_blit_inputs + 2);
-                       out[3] = ctx->ac.f32_1;
-                       return;
-               }
-
-               /* Color or texture coordinates: */
-               assert(input_index == 1);
-
-               if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
-                       for (int i = 0; i < 4; i++) {
-                               out[i] = LLVMGetParam(ctx->main_fn,
-                                                     param_vs_blit_inputs + 3 + i);
-                       }
-               } else {
-                       assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
-                       LLVMValueRef x1 = LLVMGetParam(ctx->main_fn,
-                                                      param_vs_blit_inputs + 3);
-                       LLVMValueRef y1 = LLVMGetParam(ctx->main_fn,
-                                                      param_vs_blit_inputs + 4);
-                       LLVMValueRef x2 = LLVMGetParam(ctx->main_fn,
-                                                      param_vs_blit_inputs + 5);
-                       LLVMValueRef y2 = LLVMGetParam(ctx->main_fn,
-                                                      param_vs_blit_inputs + 6);
-
-                       out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1,
-                                                x1, x2, "");
-                       out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1,
-                                                y1, y2, "");
-                       out[2] = LLVMGetParam(ctx->main_fn,
-                                             param_vs_blit_inputs + 7);
-                       out[3] = LLVMGetParam(ctx->main_fn,
-                                             param_vs_blit_inputs + 8);
-               }
-               return;
-       }
-
-       unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
-       union si_vs_fix_fetch fix_fetch;
-       LLVMValueRef vb_desc;
-       LLVMValueRef vertex_index;
-       LLVMValueRef tmp;
-
-       if (input_index < num_vbos_in_user_sgprs) {
-               vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]);
-       } else {
-               unsigned index= input_index - num_vbos_in_user_sgprs;
-               vb_desc = ac_build_load_to_sgpr(&ctx->ac,
-                                               ac_get_arg(&ctx->ac, ctx->vertex_buffers),
-                                               LLVMConstInt(ctx->ac.i32, index, 0));
-       }
-
-       vertex_index = LLVMGetParam(ctx->main_fn,
-                                   ctx->vertex_index0.arg_index +
-                                   input_index);
-
-       /* Use the open-coded implementation for all loads of doubles and
-        * of dword-sized data that needs fixups. We need to insert conversion
-        * code anyway, and the amd/common code does it for us.
-        *
-        * Note: On LLVM <= 8, we can only open-code formats with
-        * channel size >= 4 bytes.
-        */
-       bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index);
-       fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits;
-       if (opencode ||
-           (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) ||
-           (fix_fetch.u.log_size == 2)) {
-               tmp = ac_build_opencoded_load_format(
-                               &ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1,
-                               fix_fetch.u.format, fix_fetch.u.reverse, !opencode,
-                               vb_desc, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, 0, true);
-               for (unsigned i = 0; i < 4; ++i)
-                       out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), "");
-               return;
-       }
-
-       /* Do multiple loads for special formats. */
-       unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]);
-       LLVMValueRef fetches[4];
-       unsigned num_fetches;
-       unsigned fetch_stride;
-       unsigned channels_per_fetch;
-
-       if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) {
-               num_fetches = MIN2(required_channels, 3);
-               fetch_stride = 1 << fix_fetch.u.log_size;
-               channels_per_fetch = 1;
-       } else {
-               num_fetches = 1;
-               fetch_stride = 0;
-               channels_per_fetch = required_channels;
-       }
-
-       for (unsigned i = 0; i < num_fetches; ++i) {
-               LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0);
-               fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
-                                                        channels_per_fetch, 0, true);
-       }
-
-       if (num_fetches == 1 && channels_per_fetch > 1) {
-               LLVMValueRef fetch = fetches[0];
-               for (unsigned i = 0; i < channels_per_fetch; ++i) {
-                       tmp = LLVMConstInt(ctx->ac.i32, i, false);
-                       fetches[i] = LLVMBuildExtractElement(
-                               ctx->ac.builder, fetch, tmp, "");
-               }
-               num_fetches = channels_per_fetch;
-               channels_per_fetch = 1;
-       }
-
-       for (unsigned i = num_fetches; i < 4; ++i)
-               fetches[i] = LLVMGetUndef(ctx->ac.f32);
-
-       if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 &&
-           required_channels == 4) {
-               if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT)
-                       fetches[3] = ctx->ac.i32_1;
-               else
-                       fetches[3] = ctx->ac.f32_1;
-       } else if (fix_fetch.u.log_size == 3 &&
-                  (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ||
-                   fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED ||
-                   fix_fetch.u.format == AC_FETCH_FORMAT_SINT) &&
-                  required_channels == 4) {
-               /* For 2_10_10_10, the hardware returns an unsigned value;
-                * convert it to a signed one.
-                */
-               LLVMValueRef tmp = fetches[3];
-               LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);
-
-               /* First, recover the sign-extended signed integer value. */
-               if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
-                       tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->ac.i32, "");
-               else
-                       tmp = ac_to_integer(&ctx->ac, tmp);
-
-               /* For the integer-like cases, do a natural sign extension.
-                *
-                * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
-                * and happen to contain 0, 1, 2, 3 as the two LSBs of the
-                * exponent.
-                */
-               tmp = LLVMBuildShl(ctx->ac.builder, tmp,
-                                  fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ?
-                                  LLVMConstInt(ctx->ac.i32, 7, 0) : c30, "");
-               tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");
-
-               /* Convert back to the right type. */
-               if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
-                       LLVMValueRef clamp;
-                       LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0);
-                       tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
-                       clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
-                       tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
-               } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
-                       tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
-               }
-
-               fetches[3] = tmp;
-       }
-
-       for (unsigned i = 0; i < 4; ++i)
-               out[i] = ac_to_float(&ctx->ac, fetches[i]);
+   const struct si_shader_info *info = &ctx->shader->selector->info;
+   unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
+
+   if (vs_blit_property) {
+      LLVMValueRef vertex_id = ctx->abi.vertex_id;
+      LLVMValueRef sel_x1 =
+         LLVMBuildICmp(ctx->ac.builder, LLVMIntULE, vertex_id, ctx->ac.i32_1, "");
+      /* Use LLVMIntNE, because we have 3 vertices and only
+       * the middle one should use y2.
+       */
+      LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, vertex_id, ctx->ac.i32_1, "");
+
+      unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index;
+      if (input_index == 0) {
+         /* Position: */
+         LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs);
+         LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 1);
+
+         LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
+         LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
+         LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
+         LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);
+
+         LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, "");
+         LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, "");
+
+         out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, "");
+         out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, "");
+         out[2] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 2);
+         out[3] = ctx->ac.f32_1;
+         return;
+      }
+
+      /* Color or texture coordinates: */
+      assert(input_index == 1);
+
+      if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
+         for (int i = 0; i < 4; i++) {
+            out[i] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 3 + i);
+         }
+      } else {
+         assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
+         LLVMValueRef x1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 3);
+         LLVMValueRef y1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 4);
+         LLVMValueRef x2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 5);
+         LLVMValueRef y2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 6);
+
+         out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, "");
+         out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, "");
+         out[2] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 7);
+         out[3] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 8);
+      }
+      return;
+   }
+
+   unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
+   union si_vs_fix_fetch fix_fetch;
+   LLVMValueRef vb_desc;
+   LLVMValueRef vertex_index;
+   LLVMValueRef tmp;
+
+   if (input_index < num_vbos_in_user_sgprs) {
+      vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]);
+   } else {
+      unsigned index = input_index - num_vbos_in_user_sgprs;
+      vb_desc = ac_build_load_to_sgpr(&ctx->ac, ac_get_arg(&ctx->ac, ctx->vertex_buffers),
+                                      LLVMConstInt(ctx->ac.i32, index, 0));
+   }
+
+   vertex_index = LLVMGetParam(ctx->main_fn, ctx->vertex_index0.arg_index + input_index);
+
+   /* Use the open-coded implementation for all loads of doubles and
+    * of dword-sized data that needs fixups. We need to insert conversion
+    * code anyway, and the amd/common code does it for us.
+    *
+    * Note: On LLVM <= 8, we can only open-code formats with
+    * channel size >= 4 bytes.
+    */
+   bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index);
+   fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits;
+   if (opencode || (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) ||
+       (fix_fetch.u.log_size == 2)) {
+      tmp = ac_build_opencoded_load_format(&ctx->ac, fix_fetch.u.log_size,
+                                           fix_fetch.u.num_channels_m1 + 1, fix_fetch.u.format,
+                                           fix_fetch.u.reverse, !opencode, vb_desc, vertex_index,
+                                           ctx->ac.i32_0, ctx->ac.i32_0, 0, true);
+      for (unsigned i = 0; i < 4; ++i)
+         out[i] =
+            LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), "");
+      return;
+   }
+
+   /* Do multiple loads for special formats. */
+   unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]);
+   LLVMValueRef fetches[4];
+   unsigned num_fetches;
+   unsigned fetch_stride;
+   unsigned channels_per_fetch;
+
+   if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) {
+      num_fetches = MIN2(required_channels, 3);
+      fetch_stride = 1 << fix_fetch.u.log_size;
+      channels_per_fetch = 1;
+   } else {
+      num_fetches = 1;
+      fetch_stride = 0;
+      channels_per_fetch = required_channels;
+   }
+
+   for (unsigned i = 0; i < num_fetches; ++i) {
+      LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0);
+      fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
+                                               channels_per_fetch, 0, true);
+   }
+
+   if (num_fetches == 1 && channels_per_fetch > 1) {
+      LLVMValueRef fetch = fetches[0];
+      for (unsigned i = 0; i < channels_per_fetch; ++i) {
+         tmp = LLVMConstInt(ctx->ac.i32, i, false);
+         fetches[i] = LLVMBuildExtractElement(ctx->ac.builder, fetch, tmp, "");
+      }
+      num_fetches = channels_per_fetch;
+      channels_per_fetch = 1;
+   }
+
+   for (unsigned i = num_fetches; i < 4; ++i)
+      fetches[i] = LLVMGetUndef(ctx->ac.f32);
+
+   if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 && required_channels == 4) {
+      if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT)
+         fetches[3] = ctx->ac.i32_1;
+      else
+         fetches[3] = ctx->ac.f32_1;
+   } else if (fix_fetch.u.log_size == 3 &&
+              (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ||
+               fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED ||
+               fix_fetch.u.format == AC_FETCH_FORMAT_SINT) &&
+              required_channels == 4) {
+      /* For 2_10_10_10, the hardware returns an unsigned value;
+       * convert it to a signed one.
+       */
+      LLVMValueRef tmp = fetches[3];
+      LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);
+
+      /* First, recover the sign-extended signed integer value. */
+      if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
+         tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->ac.i32, "");
+      else
+         tmp = ac_to_integer(&ctx->ac, tmp);
+
+      /* For the integer-like cases, do a natural sign extension.
+       *
+       * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
+       * and happen to contain 0, 1, 2, 3 as the two LSBs of the
+       * exponent.
+       */
+      tmp = LLVMBuildShl(
+         ctx->ac.builder, tmp,
+         fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ? LLVMConstInt(ctx->ac.i32, 7, 0) : c30, "");
+      tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");
+
+      /* Convert back to the right type. */
+      if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
+         LLVMValueRef clamp;
+         LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0);
+         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
+         clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
+         tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
+      } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
+         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
+      }
+
+      fetches[3] = tmp;
+   }
+
+   for (unsigned i = 0; i < 4; ++i)
+      out[i] = ac_to_float(&ctx->ac, fetches[i]);
 }
 
 static void declare_input_vs(struct si_shader_context *ctx, unsigned input_index)
 {
-       LLVMValueRef input[4];
+   LLVMValueRef input[4];
 
-       load_input_vs(ctx, input_index / 4, input);
+   load_input_vs(ctx, input_index / 4, input);
 
-       for (unsigned chan = 0; chan < 4; chan++) {
-               ctx->inputs[input_index + chan] =
-                       LLVMBuildBitCast(ctx->ac.builder, input[chan], ctx->ac.i32, "");
-       }
+   for (unsigned chan = 0; chan < 4; chan++) {
+      ctx->inputs[input_index + chan] =
+         LLVMBuildBitCast(ctx->ac.builder, input[chan], ctx->ac.i32, "");
+   }
 }
 
 void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir)
 {
-       uint64_t processed_inputs = 0;
-
-       nir_foreach_variable(variable, &nir->inputs) {
-               unsigned attrib_count = glsl_count_attribute_slots(variable->type,
-                                                                  true);
-               unsigned input_idx = variable->data.driver_location;
-               unsigned loc = variable->data.location;
-
-               for (unsigned i = 0; i < attrib_count; i++) {
-                       /* Packed components share the same location so skip
-                        * them if we have already processed the location.
-                        */
-                       if (processed_inputs & ((uint64_t)1 << (loc + i))) {
-                               input_idx += 4;
-                               continue;
-                       }
-
-                       declare_input_vs(ctx, input_idx);
-                       if (glsl_type_is_dual_slot(variable->type)) {
-                               input_idx += 4;
-                               declare_input_vs(ctx, input_idx);
-                       }
-
-                       processed_inputs |= ((uint64_t)1 << (loc + i));
-                       input_idx += 4;
-               }
-       }
+   uint64_t processed_inputs = 0;
+
+   nir_foreach_variable (variable, &nir->inputs) {
+      unsigned attrib_count = glsl_count_attribute_slots(variable->type, true);
+      unsigned input_idx = variable->data.driver_location;
+      unsigned loc = variable->data.location;
+
+      for (unsigned i = 0; i < attrib_count; i++) {
+         /* Packed components share the same location so skip
+          * them if we have already processed the location.
+          */
+         if (processed_inputs & ((uint64_t)1 << (loc + i))) {
+            input_idx += 4;
+            continue;
+         }
+
+         declare_input_vs(ctx, input_idx);
+         if (glsl_type_is_dual_slot(variable->type)) {
+            input_idx += 4;
+            declare_input_vs(ctx, input_idx);
+         }
+
+         processed_inputs |= ((uint64_t)1 << (loc + i));
+         input_idx += 4;
+      }
+   }
 }
 
-void si_llvm_streamout_store_output(struct si_shader_context *ctx,
-                                   LLVMValueRef const *so_buffers,
-                                   LLVMValueRef const *so_write_offsets,
-                                   struct pipe_stream_output *stream_out,
-                                   struct si_shader_output_values *shader_out)
+void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef const *so_buffers,
+                                    LLVMValueRef const *so_write_offsets,
+                                    struct pipe_stream_output *stream_out,
+                                    struct si_shader_output_values *shader_out)
 {
-       unsigned buf_idx = stream_out->output_buffer;
-       unsigned start = stream_out->start_component;
-       unsigned num_comps = stream_out->num_components;
-       LLVMValueRef out[4];
-
-       assert(num_comps && num_comps <= 4);
-       if (!num_comps || num_comps > 4)
-               return;
-
-       /* Load the output as int. */
-       for (int j = 0; j < num_comps; j++) {
-               assert(stream_out->stream == shader_out->vertex_stream[start + j]);
-
-               out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
-       }
-
-       /* Pack the output. */
-       LLVMValueRef vdata = NULL;
-
-       switch (num_comps) {
-       case 1: /* as i32 */
-               vdata = out[0];
-               break;
-       case 2: /* as v2i32 */
-       case 3: /* as v3i32 */
-               if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) {
-                       vdata = ac_build_gather_values(&ctx->ac, out, num_comps);
-                       break;
-               }
-               /* as v4i32 (aligned to 4) */
-               out[3] = LLVMGetUndef(ctx->ac.i32);
-               /* fall through */
-       case 4: /* as v4i32 */
-               vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps));
-               break;
-       }
-
-       ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
-                                   vdata, num_comps,
-                                   so_write_offsets[buf_idx],
-                                   ctx->ac.i32_0,
-                                   stream_out->dst_offset * 4, ac_glc | ac_slc);
+   unsigned buf_idx = stream_out->output_buffer;
+   unsigned start = stream_out->start_component;
+   unsigned num_comps = stream_out->num_components;
+   LLVMValueRef out[4];
+
+   assert(num_comps && num_comps <= 4);
+   if (!num_comps || num_comps > 4)
+      return;
+
+   /* Load the output as int. */
+   for (int j = 0; j < num_comps; j++) {
+      assert(stream_out->stream == shader_out->vertex_stream[start + j]);
+
+      out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
+   }
+
+   /* Pack the output. */
+   LLVMValueRef vdata = NULL;
+
+   switch (num_comps) {
+   case 1: /* as i32 */
+      vdata = out[0];
+      break;
+   case 2: /* as v2i32 */
+   case 3: /* as v3i32 */
+      if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) {
+         vdata = ac_build_gather_values(&ctx->ac, out, num_comps);
+         break;
+      }
+      /* as v4i32 (aligned to 4) */
+      out[3] = LLVMGetUndef(ctx->ac.i32);
+      /* fall through */
+   case 4: /* as v4i32 */
+      vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps));
+      break;
+   }
+
+   ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx], vdata, num_comps,
+                               so_write_offsets[buf_idx], ctx->ac.i32_0, stream_out->dst_offset * 4,
+                               ac_glc | ac_slc);
 }
 
 /**
  * Write streamout data to buffers for vertex stream @p stream (different
  * vertex streams can occur for GS copy shaders).
  */
-void si_llvm_emit_streamout(struct si_shader_context *ctx,
-                           struct si_shader_output_values *outputs,
-                           unsigned noutput, unsigned stream)
+void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_output_values *outputs,
+                            unsigned noutput, unsigned stream)
 {
-       struct si_shader_selector *sel = ctx->shader->selector;
-       struct pipe_stream_output_info *so = &sel->so;
-       LLVMBuilderRef builder = ctx->ac.builder;
-       int i;
-
-       /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
-       LLVMValueRef so_vtx_count =
-               si_unpack_param(ctx, ctx->streamout_config, 16, 7);
-
-       LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
-
-       /* can_emit = tid < so_vtx_count; */
-       LLVMValueRef can_emit =
-               LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
-
-       /* Emit the streamout code conditionally. This actually avoids
-        * out-of-bounds buffer access. The hw tells us via the SGPR
-        * (so_vtx_count) which threads are allowed to emit streamout data. */
-       ac_build_ifcc(&ctx->ac, can_emit, 6501);
-       {
-               /* The buffer offset is computed as follows:
-                *   ByteOffset = streamout_offset[buffer_id]*4 +
-                *                (streamout_write_index + thread_id)*stride[buffer_id] +
-                *                attrib_offset
-                 */
-
-               LLVMValueRef so_write_index =
-                       ac_get_arg(&ctx->ac,
-                                  ctx->streamout_write_index);
-
-               /* Compute (streamout_write_index + thread_id). */
-               so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
-
-               /* Load the descriptor and compute the write offset for each
-                * enabled buffer. */
-               LLVMValueRef so_write_offset[4] = {};
-               LLVMValueRef so_buffers[4];
-               LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac,
-                                                 ctx->rw_buffers);
-
-               for (i = 0; i < 4; i++) {
-                       if (!so->stride[i])
-                               continue;
-
-                       LLVMValueRef offset = LLVMConstInt(ctx->ac.i32,
-                                                          SI_VS_STREAMOUT_BUF0 + i, 0);
-
-                       so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
-
-                       LLVMValueRef so_offset = ac_get_arg(&ctx->ac,
-                                                           ctx->streamout_offset[i]);
-                       so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");
-
-                       so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index,
-                                                          LLVMConstInt(ctx->ac.i32, so->stride[i]*4, 0),
-                                                          so_offset);
-               }
-
-               /* Write streamout data. */
-               for (i = 0; i < so->num_outputs; i++) {
-                       unsigned reg = so->output[i].register_index;
-
-                       if (reg >= noutput)
-                               continue;
-
-                       if (stream != so->output[i].stream)
-                               continue;
-
-                       si_llvm_streamout_store_output(ctx, so_buffers, so_write_offset,
-                                                      &so->output[i], &outputs[reg]);
-               }
-       }
-       ac_build_endif(&ctx->ac, 6501);
+   struct si_shader_selector *sel = ctx->shader->selector;
+   struct pipe_stream_output_info *so = &sel->so;
+   LLVMBuilderRef builder = ctx->ac.builder;
+   int i;
+
+   /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
+   LLVMValueRef so_vtx_count = si_unpack_param(ctx, ctx->streamout_config, 16, 7);
+
+   LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
+
+   /* can_emit = tid < so_vtx_count; */
+   LLVMValueRef can_emit = LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
+
+   /* Emit the streamout code conditionally. This actually avoids
+    * out-of-bounds buffer access. The hw tells us via the SGPR
+    * (so_vtx_count) which threads are allowed to emit streamout data. */
+   ac_build_ifcc(&ctx->ac, can_emit, 6501);
+   {
+      /* The buffer offset is computed as follows:
+       *   ByteOffset = streamout_offset[buffer_id]*4 +
+       *                (streamout_write_index + thread_id)*stride[buffer_id] +
+       *                attrib_offset
+       */
+
+      LLVMValueRef so_write_index = ac_get_arg(&ctx->ac, ctx->streamout_write_index);
+
+      /* Compute (streamout_write_index + thread_id). */
+      so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
+
+      /* Load the descriptor and compute the write offset for each
+       * enabled buffer. */
+      LLVMValueRef so_write_offset[4] = {};
+      LLVMValueRef so_buffers[4];
+      LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+
+      for (i = 0; i < 4; i++) {
+         if (!so->stride[i])
+            continue;
+
+         LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + i, 0);
+
+         so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
+
+         LLVMValueRef so_offset = ac_get_arg(&ctx->ac, ctx->streamout_offset[i]);
+         so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");
+
+         so_write_offset[i] = ac_build_imad(
+            &ctx->ac, so_write_index, LLVMConstInt(ctx->ac.i32, so->stride[i] * 4, 0), so_offset);
+      }
+
+      /* Write streamout data. */
+      for (i = 0; i < so->num_outputs; i++) {
+         unsigned reg = so->output[i].register_index;
+
+         if (reg >= noutput)
+            continue;
+
+         if (stream != so->output[i].stream)
+            continue;
+
+         si_llvm_streamout_store_output(ctx, so_buffers, so_write_offset, &so->output[i],
+                                        &outputs[reg]);
+      }
+   }
+   ac_build_endif(&ctx->ac, 6501);
 }
 
-static void si_llvm_emit_clipvertex(struct si_shader_context *ctx,
-                                   struct ac_export_args *pos, LLVMValueRef *out_elts)
+static void si_llvm_emit_clipvertex(struct si_shader_context *ctx, struct ac_export_args *pos,
+                                    LLVMValueRef *out_elts)
 {
-       unsigned reg_index;
-       unsigned chan;
-       unsigned const_chan;
-       LLVMValueRef base_elt;
-       LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
-       LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32,
-                                                  SI_VS_CONST_CLIP_PLANES, 0);
-       LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);
-
-       for (reg_index = 0; reg_index < 2; reg_index ++) {
-               struct ac_export_args *args = &pos[2 + reg_index];
-
-               args->out[0] =
-               args->out[1] =
-               args->out[2] =
-               args->out[3] = LLVMConstReal(ctx->ac.f32, 0.0f);
-
-               /* Compute dot products of position and user clip plane vectors */
-               for (chan = 0; chan < 4; chan++) {
-                       for (const_chan = 0; const_chan < 4; const_chan++) {
-                               LLVMValueRef addr =
-                                       LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 +
-                                                               const_chan) * 4, 0);
-                               base_elt = si_buffer_load_const(ctx, const_resource,
-                                                               addr);
-                               args->out[chan] = ac_build_fmad(&ctx->ac, base_elt,
-                                                               out_elts[const_chan], args->out[chan]);
-                       }
-               }
-
-               args->enabled_channels = 0xf;
-               args->valid_mask = 0;
-               args->done = 0;
-               args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
-               args->compr = 0;
-       }
+   unsigned reg_index;
+   unsigned chan;
+   unsigned const_chan;
+   LLVMValueRef base_elt;
+   LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+   LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_CLIP_PLANES, 0);
+   LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);
+
+   for (reg_index = 0; reg_index < 2; reg_index++) {
+      struct ac_export_args *args = &pos[2 + reg_index];
+
+      args->out[0] = args->out[1] = args->out[2] = args->out[3] = LLVMConstReal(ctx->ac.f32, 0.0f);
+
+      /* Compute dot products of position and user clip plane vectors */
+      for (chan = 0; chan < 4; chan++) {
+         for (const_chan = 0; const_chan < 4; const_chan++) {
+            LLVMValueRef addr =
+               LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 + const_chan) * 4, 0);
+            base_elt = si_buffer_load_const(ctx, const_resource, addr);
+            args->out[chan] =
+               ac_build_fmad(&ctx->ac, base_elt, out_elts[const_chan], args->out[chan]);
+         }
+      }
+
+      args->enabled_channels = 0xf;
+      args->valid_mask = 0;
+      args->done = 0;
+      args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
+      args->compr = 0;
+   }
 }
 
 /* Initialize arguments for the shader export intrinsic */
-static void si_llvm_init_vs_export_args(struct si_shader_context *ctx,
-                                       LLVMValueRef *values,
-                                       unsigned target,
-                                       struct ac_export_args *args)
+static void si_llvm_init_vs_export_args(struct si_shader_context *ctx, LLVMValueRef *values,
+                                        unsigned target, struct ac_export_args *args)
 {
-       args->enabled_channels = 0xf; /* writemask - default is 0xf */
-       args->valid_mask = 0; /* Specify whether the EXEC mask represents the valid mask */
-       args->done = 0; /* Specify whether this is the last export */
-       args->target = target; /* Specify the target we are exporting */
-       args->compr = false;
+   args->enabled_channels = 0xf; /* writemask - default is 0xf */
+   args->valid_mask = 0;         /* Specify whether the EXEC mask represents the valid mask */
+   args->done = 0;               /* Specify whether this is the last export */
+   args->target = target;        /* Specify the target we are exporting */
+   args->compr = false;
 
-       memcpy(&args->out[0], values, sizeof(values[0]) * 4);
+   memcpy(&args->out[0], values, sizeof(values[0]) * 4);
 }
 
-static void si_export_param(struct si_shader_context *ctx, unsigned index,
-                           LLVMValueRef *values)
+static void si_export_param(struct si_shader_context *ctx, unsigned index, LLVMValueRef *values)
 {
-       struct ac_export_args args;
+   struct ac_export_args args;
 
-       si_llvm_init_vs_export_args(ctx, values,
-                                   V_008DFC_SQ_EXP_PARAM + index, &args);
-       ac_build_export(&ctx->ac, &args);
+   si_llvm_init_vs_export_args(ctx, values, V_008DFC_SQ_EXP_PARAM + index, &args);
+   ac_build_export(&ctx->ac, &args);
 }
 
 static void si_build_param_exports(struct si_shader_context *ctx,
-                                  struct si_shader_output_values *outputs,
-                                  unsigned noutput)
+                                   struct si_shader_output_values *outputs, unsigned noutput)
 {
-       struct si_shader *shader = ctx->shader;
-       unsigned param_count = 0;
-
-       for (unsigned i = 0; i < noutput; i++) {
-               unsigned semantic_name = outputs[i].semantic_name;
-               unsigned semantic_index = outputs[i].semantic_index;
-
-               if (outputs[i].vertex_stream[0] != 0 &&
-                   outputs[i].vertex_stream[1] != 0 &&
-                   outputs[i].vertex_stream[2] != 0 &&
-                   outputs[i].vertex_stream[3] != 0)
-                       continue;
-
-               switch (semantic_name) {
-               case TGSI_SEMANTIC_LAYER:
-               case TGSI_SEMANTIC_VIEWPORT_INDEX:
-               case TGSI_SEMANTIC_CLIPDIST:
-               case TGSI_SEMANTIC_COLOR:
-               case TGSI_SEMANTIC_BCOLOR:
-               case TGSI_SEMANTIC_PRIMID:
-               case TGSI_SEMANTIC_FOG:
-               case TGSI_SEMANTIC_TEXCOORD:
-               case TGSI_SEMANTIC_GENERIC:
-                       break;
-               default:
-                       continue;
-               }
-
-               if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
-                    semantic_index < SI_MAX_IO_GENERIC) &&
-                   shader->key.opt.kill_outputs &
-                   (1ull << si_shader_io_get_unique_index(semantic_name,
-                                                          semantic_index, true)))
-                       continue;
-
-               si_export_param(ctx, param_count, outputs[i].values);
-
-               assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
-               shader->info.vs_output_param_offset[i] = param_count++;
-       }
-
-       shader->info.nr_param_exports = param_count;
+   struct si_shader *shader = ctx->shader;
+   unsigned param_count = 0;
+
+   for (unsigned i = 0; i < noutput; i++) {
+      unsigned semantic_name = outputs[i].semantic_name;
+      unsigned semantic_index = outputs[i].semantic_index;
+
+      if (outputs[i].vertex_stream[0] != 0 && outputs[i].vertex_stream[1] != 0 &&
+          outputs[i].vertex_stream[2] != 0 && outputs[i].vertex_stream[3] != 0)
+         continue;
+
+      switch (semantic_name) {
+      case TGSI_SEMANTIC_LAYER:
+      case TGSI_SEMANTIC_VIEWPORT_INDEX:
+      case TGSI_SEMANTIC_CLIPDIST:
+      case TGSI_SEMANTIC_COLOR:
+      case TGSI_SEMANTIC_BCOLOR:
+      case TGSI_SEMANTIC_PRIMID:
+      case TGSI_SEMANTIC_FOG:
+      case TGSI_SEMANTIC_TEXCOORD:
+      case TGSI_SEMANTIC_GENERIC:
+         break;
+      default:
+         continue;
+      }
+
+      if ((semantic_name != TGSI_SEMANTIC_GENERIC || semantic_index < SI_MAX_IO_GENERIC) &&
+          shader->key.opt.kill_outputs &
+             (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index, true)))
+         continue;
+
+      si_export_param(ctx, param_count, outputs[i].values);
+
+      assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
+      shader->info.vs_output_param_offset[i] = param_count++;
+   }
+
+   shader->info.nr_param_exports = param_count;
 }
 
 /**
@@ -544,296 +489,272 @@ static void si_build_param_exports(struct si_shader_context *ctx,
  * is true.
  */
 static void si_vertex_color_clamping(struct si_shader_context *ctx,
-                                    struct si_shader_output_values *outputs,
-                                    unsigned noutput)
+                                     struct si_shader_output_values *outputs, unsigned noutput)
 {
-       LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4];
-       bool has_colors = false;
-
-       /* Store original colors to alloca variables. */
-       for (unsigned i = 0; i < noutput; i++) {
-               if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
-                   outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
-                       continue;
-
-               for (unsigned j = 0; j < 4; j++) {
-                       addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
-                       LLVMBuildStore(ctx->ac.builder, outputs[i].values[j], addr[i][j]);
-               }
-               has_colors = true;
-       }
-
-       if (!has_colors)
-               return;
-
-       /* The state is in the first bit of the user SGPR. */
-       LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
-       cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, "");
-
-       ac_build_ifcc(&ctx->ac, cond, 6502);
-
-       /* Store clamped colors to alloca variables within the conditional block. */
-       for (unsigned i = 0; i < noutput; i++) {
-               if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
-                   outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
-                       continue;
-
-               for (unsigned j = 0; j < 4; j++) {
-                       LLVMBuildStore(ctx->ac.builder,
-                                      ac_build_clamp(&ctx->ac, outputs[i].values[j]),
-                                      addr[i][j]);
-               }
-       }
-       ac_build_endif(&ctx->ac, 6502);
-
-       /* Load clamped colors */
-       for (unsigned i = 0; i < noutput; i++) {
-               if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
-                   outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
-                       continue;
-
-               for (unsigned j = 0; j < 4; j++) {
-                       outputs[i].values[j] =
-                               LLVMBuildLoad(ctx->ac.builder, addr[i][j], "");
-               }
-       }
+   LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4];
+   bool has_colors = false;
+
+   /* Store original colors to alloca variables. */
+   for (unsigned i = 0; i < noutput; i++) {
+      if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
+          outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
+         continue;
+
+      for (unsigned j = 0; j < 4; j++) {
+         addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
+         LLVMBuildStore(ctx->ac.builder, outputs[i].values[j], addr[i][j]);
+      }
+      has_colors = true;
+   }
+
+   if (!has_colors)
+      return;
+
+   /* The state is in the first bit of the user SGPR. */
+   LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
+   cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, "");
+
+   ac_build_ifcc(&ctx->ac, cond, 6502);
+
+   /* Store clamped colors to alloca variables within the conditional block. */
+   for (unsigned i = 0; i < noutput; i++) {
+      if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
+          outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
+         continue;
+
+      for (unsigned j = 0; j < 4; j++) {
+         LLVMBuildStore(ctx->ac.builder, ac_build_clamp(&ctx->ac, outputs[i].values[j]),
+                        addr[i][j]);
+      }
+   }
+   ac_build_endif(&ctx->ac, 6502);
+
+   /* Load clamped colors */
+   for (unsigned i = 0; i < noutput; i++) {
+      if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
+          outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
+         continue;
+
+      for (unsigned j = 0; j < 4; j++) {
+         outputs[i].values[j] = LLVMBuildLoad(ctx->ac.builder, addr[i][j], "");
+      }
+   }
 }
 
 /* Generate export instructions for hardware VS shader stage or NGG GS stage
  * (position and parameter data only).
  */
 void si_llvm_build_vs_exports(struct si_shader_context *ctx,
-                             struct si_shader_output_values *outputs,
-                             unsigned noutput)
+                              struct si_shader_output_values *outputs, unsigned noutput)
 {
-       struct si_shader *shader = ctx->shader;
-       struct ac_export_args pos_args[4] = {};
-       LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
-       unsigned pos_idx;
-       int i;
-
-       si_vertex_color_clamping(ctx, outputs, noutput);
-
-       /* Build position exports. */
-       for (i = 0; i < noutput; i++) {
-               switch (outputs[i].semantic_name) {
-               case TGSI_SEMANTIC_POSITION:
-                       si_llvm_init_vs_export_args(ctx, outputs[i].values,
-                                                   V_008DFC_SQ_EXP_POS, &pos_args[0]);
-                       break;
-               case TGSI_SEMANTIC_PSIZE:
-                       psize_value = outputs[i].values[0];
-                       break;
-               case TGSI_SEMANTIC_LAYER:
-                       layer_value = outputs[i].values[0];
-                       break;
-               case TGSI_SEMANTIC_VIEWPORT_INDEX:
-                       viewport_index_value = outputs[i].values[0];
-                       break;
-               case TGSI_SEMANTIC_EDGEFLAG:
-                       edgeflag_value = outputs[i].values[0];
-                       break;
-               case TGSI_SEMANTIC_CLIPDIST:
-                       if (!shader->key.opt.clip_disable) {
-                               unsigned index = 2 + outputs[i].semantic_index;
-                               si_llvm_init_vs_export_args(ctx, outputs[i].values,
-                                                           V_008DFC_SQ_EXP_POS + index,
-                                                           &pos_args[index]);
-                       }
-                       break;
-               case TGSI_SEMANTIC_CLIPVERTEX:
-                       if (!shader->key.opt.clip_disable) {
-                               si_llvm_emit_clipvertex(ctx, pos_args,
-                                                       outputs[i].values);
-                       }
-                       break;
-               }
-       }
-
-       /* We need to add the position output manually if it's missing. */
-       if (!pos_args[0].out[0]) {
-               pos_args[0].enabled_channels = 0xf; /* writemask */
-               pos_args[0].valid_mask = 0; /* EXEC mask */
-               pos_args[0].done = 0; /* last export? */
-               pos_args[0].target = V_008DFC_SQ_EXP_POS;
-               pos_args[0].compr = 0; /* COMPR flag */
-               pos_args[0].out[0] = ctx->ac.f32_0; /* X */
-               pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
-               pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
-               pos_args[0].out[3] = ctx->ac.f32_1;  /* W */
-       }
-
-       bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag &&
-                                  !shader->key.as_ngg;
-
-       /* Write the misc vector (point size, edgeflag, layer, viewport). */
-       if (shader->selector->info.writes_psize ||
-           pos_writes_edgeflag ||
-           shader->selector->info.writes_viewport_index ||
-           shader->selector->info.writes_layer) {
-               pos_args[1].enabled_channels = shader->selector->info.writes_psize |
-                                              (pos_writes_edgeflag << 1) |
-                                              (shader->selector->info.writes_layer << 2);
-
-               pos_args[1].valid_mask = 0; /* EXEC mask */
-               pos_args[1].done = 0; /* last export? */
-               pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
-               pos_args[1].compr = 0; /* COMPR flag */
-               pos_args[1].out[0] = ctx->ac.f32_0; /* X */
-               pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
-               pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
-               pos_args[1].out[3] = ctx->ac.f32_0; /* W */
-
-               if (shader->selector->info.writes_psize)
-                       pos_args[1].out[0] = psize_value;
-
-               if (pos_writes_edgeflag) {
-                       /* The output is a float, but the hw expects an integer
-                        * with the first bit containing the edge flag. */
-                       edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder,
-                                                        edgeflag_value,
-                                                        ctx->ac.i32, "");
-                       edgeflag_value = ac_build_umin(&ctx->ac,
-                                                     edgeflag_value,
-                                                     ctx->ac.i32_1);
-
-                       /* The LLVM intrinsic expects a float. */
-                       pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
-               }
-
-               if (ctx->screen->info.chip_class >= GFX9) {
-                       /* GFX9 has the layer in out.z[10:0] and the viewport
-                        * index in out.z[19:16].
-                        */
-                       if (shader->selector->info.writes_layer)
-                               pos_args[1].out[2] = layer_value;
-
-                       if (shader->selector->info.writes_viewport_index) {
-                               LLVMValueRef v = viewport_index_value;
-
-                               v = ac_to_integer(&ctx->ac, v);
-                               v = LLVMBuildShl(ctx->ac.builder, v,
-                                                LLVMConstInt(ctx->ac.i32, 16, 0), "");
-                               v = LLVMBuildOr(ctx->ac.builder, v,
-                                               ac_to_integer(&ctx->ac,  pos_args[1].out[2]), "");
-                               pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
-                               pos_args[1].enabled_channels |= 1 << 2;
-                       }
-               } else {
-                       if (shader->selector->info.writes_layer)
-                               pos_args[1].out[2] = layer_value;
-
-                       if (shader->selector->info.writes_viewport_index) {
-                               pos_args[1].out[3] = viewport_index_value;
-                               pos_args[1].enabled_channels |= 1 << 3;
-                       }
-               }
-       }
-
-       for (i = 0; i < 4; i++)
-               if (pos_args[i].out[0])
-                       shader->info.nr_pos_exports++;
-
-       /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
-        * Setting valid_mask=1 prevents it and has no other effect.
-        */
-       if (ctx->screen->info.family == CHIP_NAVI10 ||
-           ctx->screen->info.family == CHIP_NAVI12 ||
-           ctx->screen->info.family == CHIP_NAVI14)
-               pos_args[0].valid_mask = 1;
-
-       pos_idx = 0;
-       for (i = 0; i < 4; i++) {
-               if (!pos_args[i].out[0])
-                       continue;
-
-               /* Specify the target we are exporting */
-               pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
-
-               if (pos_idx == shader->info.nr_pos_exports)
-                       /* Specify that this is the last export */
-                       pos_args[i].done = 1;
-
-               ac_build_export(&ctx->ac, &pos_args[i]);
-       }
-
-       /* Build parameter exports. */
-       si_build_param_exports(ctx, outputs, noutput);
+   struct si_shader *shader = ctx->shader;
+   struct ac_export_args pos_args[4] = {};
+   LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL,
+                viewport_index_value = NULL;
+   unsigned pos_idx;
+   int i;
+
+   si_vertex_color_clamping(ctx, outputs, noutput);
+
+   /* Build position exports. */
+   for (i = 0; i < noutput; i++) {
+      switch (outputs[i].semantic_name) {
+      case TGSI_SEMANTIC_POSITION:
+         si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS, &pos_args[0]);
+         break;
+      case TGSI_SEMANTIC_PSIZE:
+         psize_value = outputs[i].values[0];
+         break;
+      case TGSI_SEMANTIC_LAYER:
+         layer_value = outputs[i].values[0];
+         break;
+      case TGSI_SEMANTIC_VIEWPORT_INDEX:
+         viewport_index_value = outputs[i].values[0];
+         break;
+      case TGSI_SEMANTIC_EDGEFLAG:
+         edgeflag_value = outputs[i].values[0];
+         break;
+      case TGSI_SEMANTIC_CLIPDIST:
+         if (!shader->key.opt.clip_disable) {
+            unsigned index = 2 + outputs[i].semantic_index;
+            si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS + index,
+                                        &pos_args[index]);
+         }
+         break;
+      case TGSI_SEMANTIC_CLIPVERTEX:
+         if (!shader->key.opt.clip_disable) {
+            si_llvm_emit_clipvertex(ctx, pos_args, outputs[i].values);
+         }
+         break;
+      }
+   }
+
+   /* We need to add the position output manually if it's missing. */
+   if (!pos_args[0].out[0]) {
+      pos_args[0].enabled_channels = 0xf; /* writemask */
+      pos_args[0].valid_mask = 0;         /* EXEC mask */
+      pos_args[0].done = 0;               /* last export? */
+      pos_args[0].target = V_008DFC_SQ_EXP_POS;
+      pos_args[0].compr = 0;              /* COMPR flag */
+      pos_args[0].out[0] = ctx->ac.f32_0; /* X */
+      pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
+      pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
+      pos_args[0].out[3] = ctx->ac.f32_1; /* W */
+   }
+
+   bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag && !shader->key.as_ngg;
+
+   /* Write the misc vector (point size, edgeflag, layer, viewport). */
+   if (shader->selector->info.writes_psize || pos_writes_edgeflag ||
+       shader->selector->info.writes_viewport_index || shader->selector->info.writes_layer) {
+      pos_args[1].enabled_channels = shader->selector->info.writes_psize |
+                                     (pos_writes_edgeflag << 1) |
+                                     (shader->selector->info.writes_layer << 2);
+
+      pos_args[1].valid_mask = 0; /* EXEC mask */
+      pos_args[1].done = 0;       /* last export? */
+      pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
+      pos_args[1].compr = 0;              /* COMPR flag */
+      pos_args[1].out[0] = ctx->ac.f32_0; /* X */
+      pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
+      pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
+      pos_args[1].out[3] = ctx->ac.f32_0; /* W */
+
+      if (shader->selector->info.writes_psize)
+         pos_args[1].out[0] = psize_value;
+
+      if (pos_writes_edgeflag) {
+         /* The output is a float, but the hw expects an integer
+          * with the first bit containing the edge flag. */
+         edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder, edgeflag_value, ctx->ac.i32, "");
+         edgeflag_value = ac_build_umin(&ctx->ac, edgeflag_value, ctx->ac.i32_1);
+
+         /* The LLVM intrinsic expects a float. */
+         pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
+      }
+
+      if (ctx->screen->info.chip_class >= GFX9) {
+         /* GFX9 has the layer in out.z[10:0] and the viewport
+          * index in out.z[19:16].
+          */
+         if (shader->selector->info.writes_layer)
+            pos_args[1].out[2] = layer_value;
+
+         if (shader->selector->info.writes_viewport_index) {
+            LLVMValueRef v = viewport_index_value;
+
+            v = ac_to_integer(&ctx->ac, v);
+            v = LLVMBuildShl(ctx->ac.builder, v, LLVMConstInt(ctx->ac.i32, 16, 0), "");
+            v = LLVMBuildOr(ctx->ac.builder, v, ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
+            pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
+            pos_args[1].enabled_channels |= 1 << 2;
+         }
+      } else {
+         if (shader->selector->info.writes_layer)
+            pos_args[1].out[2] = layer_value;
+
+         if (shader->selector->info.writes_viewport_index) {
+            pos_args[1].out[3] = viewport_index_value;
+            pos_args[1].enabled_channels |= 1 << 3;
+         }
+      }
+   }
+
+   for (i = 0; i < 4; i++)
+      if (pos_args[i].out[0])
+         shader->info.nr_pos_exports++;
+
+   /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
+    * Setting valid_mask=1 prevents it and has no other effect.
+    */
+   if (ctx->screen->info.family == CHIP_NAVI10 || ctx->screen->info.family == CHIP_NAVI12 ||
+       ctx->screen->info.family == CHIP_NAVI14)
+      pos_args[0].valid_mask = 1;
+
+   pos_idx = 0;
+   for (i = 0; i < 4; i++) {
+      if (!pos_args[i].out[0])
+         continue;
+
+      /* Specify the target we are exporting */
+      pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
+
+      if (pos_idx == shader->info.nr_pos_exports)
+         /* Specify that this is the last export */
+         pos_args[i].done = 1;
+
+      ac_build_export(&ctx->ac, &pos_args[i]);
+   }
+
+   /* Build parameter exports. */
+   si_build_param_exports(ctx, outputs, noutput);
 }
 
-void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
-                             LLVMValueRef *addrs)
+void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct si_shader_info *info = &ctx->shader->selector->info;
-       struct si_shader_output_values *outputs = NULL;
-       int i,j;
-
-       assert(!ctx->shader->is_gs_copy_shader);
-       assert(info->num_outputs <= max_outputs);
-
-       outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
-
-       for (i = 0; i < info->num_outputs; i++) {
-               outputs[i].semantic_name = info->output_semantic_name[i];
-               outputs[i].semantic_index = info->output_semantic_index[i];
-
-               for (j = 0; j < 4; j++) {
-                       outputs[i].values[j] =
-                               LLVMBuildLoad(ctx->ac.builder,
-                                             addrs[4 * i + j],
-                                             "");
-                       outputs[i].vertex_stream[j] =
-                               (info->output_streams[i] >> (2 * j)) & 3;
-               }
-       }
-
-       if (!ctx->screen->use_ngg_streamout &&
-           ctx->shader->selector->so.num_outputs)
-               si_llvm_emit_streamout(ctx, outputs, i, 0);
-
-       /* Export PrimitiveID. */
-       if (ctx->shader->key.mono.u.vs_export_prim_id) {
-               outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
-               outputs[i].semantic_index = 0;
-               outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
-               for (j = 1; j < 4; j++)
-                       outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0);
-
-               memset(outputs[i].vertex_stream, 0,
-                      sizeof(outputs[i].vertex_stream));
-               i++;
-       }
-
-       si_llvm_build_vs_exports(ctx, outputs, i);
-       FREE(outputs);
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader_info *info = &ctx->shader->selector->info;
+   struct si_shader_output_values *outputs = NULL;
+   int i, j;
+
+   assert(!ctx->shader->is_gs_copy_shader);
+   assert(info->num_outputs <= max_outputs);
+
+   outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
+
+   for (i = 0; i < info->num_outputs; i++) {
+      outputs[i].semantic_name = info->output_semantic_name[i];
+      outputs[i].semantic_index = info->output_semantic_index[i];
+
+      for (j = 0; j < 4; j++) {
+         outputs[i].values[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
+         outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3;
+      }
+   }
+
+   if (!ctx->screen->use_ngg_streamout && ctx->shader->selector->so.num_outputs)
+      si_llvm_emit_streamout(ctx, outputs, i, 0);
+
+   /* Export PrimitiveID. */
+   if (ctx->shader->key.mono.u.vs_export_prim_id) {
+      outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
+      outputs[i].semantic_index = 0;
+      outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
+      for (j = 1; j < 4; j++)
+         outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0);
+
+      memset(outputs[i].vertex_stream, 0, sizeof(outputs[i].vertex_stream));
+      i++;
+   }
+
+   si_llvm_build_vs_exports(ctx, outputs, i);
+   FREE(outputs);
 }
 
-static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi,
-                                                 unsigned max_outputs,
-                                                 LLVMValueRef *addrs)
+static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
+                                                  LLVMValueRef *addrs)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct si_shader_info *info = &ctx->shader->selector->info;
-       LLVMValueRef pos[4] = {};
-
-       assert(info->num_outputs <= max_outputs);
-
-       for (unsigned i = 0; i < info->num_outputs; i++) {
-               if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION)
-                       continue;
-
-               for (unsigned chan = 0; chan < 4; chan++)
-                       pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
-               break;
-       }
-       assert(pos[0] != NULL);
-
-       /* Return the position output. */
-       LLVMValueRef ret = ctx->return_value;
-       for (unsigned chan = 0; chan < 4; chan++)
-               ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, "");
-       ctx->return_value = ret;
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader_info *info = &ctx->shader->selector->info;
+   LLVMValueRef pos[4] = {};
+
+   assert(info->num_outputs <= max_outputs);
+
+   for (unsigned i = 0; i < info->num_outputs; i++) {
+      if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION)
+         continue;
+
+      for (unsigned chan = 0; chan < 4; chan++)
+         pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
+      break;
+   }
+   assert(pos[0] != NULL);
+
+   /* Return the position output. */
+   LLVMValueRef ret = ctx->return_value;
+   for (unsigned chan = 0; chan < 4; chan++)
+      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, "");
+   ctx->return_value = ret;
 }
 
 /**
@@ -852,280 +773,252 @@ static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi,
  *   (InstanceID + StartInstance),
  *   (InstanceID / 2 + StartInstance)
  */
-void si_llvm_build_vs_prolog(struct si_shader_context *ctx,
-                            union si_shader_part_key *key)
+void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
 {
-       LLVMTypeRef *returns;
-       LLVMValueRef ret, func;
-       int num_returns, i;
-       unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
-       unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4 +
-                                  (key->vs_prolog.has_ngg_cull_inputs ? 1 : 0);
-       struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs];
-       struct ac_arg input_vgpr_param[10];
-       LLVMValueRef input_vgprs[10];
-       unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
-                                     num_input_vgprs;
-       unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
-
-       memset(&ctx->args, 0, sizeof(ctx->args));
-
-       /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
-       returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) *
-                        sizeof(LLVMTypeRef));
-       num_returns = 0;
-
-       /* Declare input and output SGPRs. */
-       for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                          &input_sgpr_param[i]);
-               returns[num_returns++] = ctx->ac.i32;
-       }
-
-       struct ac_arg merged_wave_info = input_sgpr_param[3];
-
-       /* Preloaded VGPRs (outputs must be floats) */
-       for (i = 0; i < num_input_vgprs; i++) {
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]);
-               returns[num_returns++] = ctx->ac.f32;
-       }
-
-       /* Vertex load indices. */
-       for (i = 0; i < key->vs_prolog.num_inputs; i++)
-               returns[num_returns++] = ctx->ac.f32;
-
-       /* Create the function. */
-       si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0);
-       func = ctx->main_fn;
-
-       for (i = 0; i < num_input_vgprs; i++) {
-               input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]);
-       }
-
-       if (key->vs_prolog.num_merged_next_stage_vgprs) {
-               if (!key->vs_prolog.is_monolithic)
-                       si_init_exec_from_input(ctx, merged_wave_info, 0);
-
-               if (key->vs_prolog.as_ls &&
-                   ctx->screen->info.has_ls_vgpr_init_bug) {
-                       /* If there are no HS threads, SPI loads the LS VGPRs
-                        * starting at VGPR 0. Shift them back to where they
-                        * belong.
-                        */
-                       LLVMValueRef has_hs_threads =
-                               LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
-                                   si_unpack_param(ctx, input_sgpr_param[3], 8, 8),
-                                   ctx->ac.i32_0, "");
-
-                       for (i = 4; i > 0; --i) {
-                               input_vgprs[i + 1] =
-                                       LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
-                                                       input_vgprs[i + 1],
-                                                       input_vgprs[i - 1], "");
-                       }
-               }
-       }
-
-       if (key->vs_prolog.gs_fast_launch_tri_list ||
-           key->vs_prolog.gs_fast_launch_tri_strip) {
-               LLVMValueRef wave_id, thread_id_in_tg;
-
-               wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4);
-               thread_id_in_tg = ac_build_imad(&ctx->ac, wave_id,
-                                               LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false),
-                                               ac_get_thread_id(&ctx->ac));
-
-               /* The GS fast launch initializes all VGPRs to the value of
-                * the first thread, so we have to add the thread ID.
-                *
-                * Only these are initialized by the hw:
-                *   VGPR2: Base Primitive ID
-                *   VGPR5: Base Vertex ID
-                *   VGPR6: Instance ID
-                */
-
-               /* Put the vertex thread IDs into VGPRs as-is instead of packing them.
-                * The NGG cull shader will read them from there.
-                */
-               if (key->vs_prolog.gs_fast_launch_tri_list) {
-                       input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */
-                                                      LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */
-                                                      LLVMConstInt(ctx->ac.i32, 0, 0));
-                       input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */
-                                                      LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */
-                                                      LLVMConstInt(ctx->ac.i32, 1, 0));
-                       input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */
-                                                      LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */
-                                                      LLVMConstInt(ctx->ac.i32, 2, 0));
-               } else {
-                       assert(key->vs_prolog.gs_fast_launch_tri_strip);
-                       LLVMBuilderRef builder = ctx->ac.builder;
-                       /* Triangle indices: */
-                       LLVMValueRef index[3] = {
-                               thread_id_in_tg,
-                               LLVMBuildAdd(builder, thread_id_in_tg,
-                                            LLVMConstInt(ctx->ac.i32, 1, 0), ""),
-                               LLVMBuildAdd(builder, thread_id_in_tg,
-                                            LLVMConstInt(ctx->ac.i32, 2, 0), ""),
-                       };
-                       LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder,
-                                                            thread_id_in_tg, ctx->ac.i1, "");
-                       LLVMValueRef flatshade_first =
-                               LLVMBuildICmp(builder, LLVMIntEQ,
-                                             si_unpack_param(ctx, ctx->vs_state_bits, 4, 2),
-                                             ctx->ac.i32_0, "");
-
-                       ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd,
-                                                                   flatshade_first, index);
-                       input_vgprs[0] = index[0];
-                       input_vgprs[1] = index[1];
-                       input_vgprs[4] = index[2];
-               }
-
-               /* Triangles always have all edge flags set initially. */
-               input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0);
-
-               input_vgprs[2] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[2],
-                                             thread_id_in_tg, ""); /* PrimID */
-               input_vgprs[5] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[5],
-                                             thread_id_in_tg, ""); /* VertexID */
-               input_vgprs[8] = input_vgprs[6]; /* InstanceID */
-       }
-
-       unsigned vertex_id_vgpr = first_vs_vgpr;
-       unsigned instance_id_vgpr =
-               ctx->screen->info.chip_class >= GFX10 ?
-                       first_vs_vgpr + 3 :
-                       first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);
-
-       ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr];
-       ctx->abi.instance_id = input_vgprs[instance_id_vgpr];
-
-       /* InstanceID = VertexID >> 16;
-        * VertexID   = VertexID & 0xffff;
-        */
-       if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) {
-               ctx->abi.instance_id = LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id,
-                                                    LLVMConstInt(ctx->ac.i32, 16, 0), "");
-               ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id,
-                                                 LLVMConstInt(ctx->ac.i32, 0xffff, 0), "");
-       }
-
-       /* Copy inputs to outputs. This should be no-op, as the registers match,
-        * but it will prevent the compiler from overwriting them unintentionally.
-        */
-       ret = ctx->return_value;
-       for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
-               LLVMValueRef p = LLVMGetParam(func, i);
-               ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
-       }
-       for (i = 0; i < num_input_vgprs; i++) {
-               LLVMValueRef p = input_vgprs[i];
-
-               if (i == vertex_id_vgpr)
-                       p = ctx->abi.vertex_id;
-               else if (i == instance_id_vgpr)
-                       p = ctx->abi.instance_id;
-
-               p = ac_to_float(&ctx->ac, p);
-               ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p,
-                                          key->vs_prolog.num_input_sgprs + i, "");
-       }
-
-       /* Compute vertex load indices from instance divisors. */
-       LLVMValueRef instance_divisor_constbuf = NULL;
-
-       if (key->vs_prolog.states.instance_divisor_is_fetched) {
-               LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
-               LLVMValueRef buf_index =
-                       LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
-               instance_divisor_constbuf =
-                       ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
-       }
-
-       for (i = 0; i < key->vs_prolog.num_inputs; i++) {
-               bool divisor_is_one =
-                       key->vs_prolog.states.instance_divisor_is_one & (1u << i);
-               bool divisor_is_fetched =
-                       key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
-               LLVMValueRef index = NULL;
-
-               if (divisor_is_one) {
-                       index = ctx->abi.instance_id;
-               } else if (divisor_is_fetched) {
-                       LLVMValueRef udiv_factors[4];
-
-                       for (unsigned j = 0; j < 4; j++) {
-                               udiv_factors[j] =
-                                       si_buffer_load_const(ctx, instance_divisor_constbuf,
-                                                            LLVMConstInt(ctx->ac.i32, i*16 + j*4, 0));
-                               udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
-                       }
-                       /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
-                        * Such InstanceID might not be achievable in a reasonable time though.
-                        */
-                       index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id,
-                                                      udiv_factors[0], udiv_factors[1],
-                                                      udiv_factors[2], udiv_factors[3]);
-               }
-
-               if (divisor_is_one || divisor_is_fetched) {
-                       /* Add StartInstance. */
-                       index = LLVMBuildAdd(ctx->ac.builder, index,
-                                            LLVMGetParam(ctx->main_fn, user_sgpr_base +
-                                                         SI_SGPR_START_INSTANCE), "");
-               } else {
-                       /* VertexID + BaseVertex */
-                       index = LLVMBuildAdd(ctx->ac.builder,
-                                            ctx->abi.vertex_id,
-                                            LLVMGetParam(func, user_sgpr_base +
-                                                               SI_SGPR_BASE_VERTEX), "");
-               }
-
-               index = ac_to_float(&ctx->ac, index);
-               ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index,
-                                          ctx->args.arg_count + i, "");
-       }
-
-       si_llvm_build_ret(ctx, ret);
+   LLVMTypeRef *returns;
+   LLVMValueRef ret, func;
+   int num_returns, i;
+   unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
+   unsigned num_input_vgprs =
+      key->vs_prolog.num_merged_next_stage_vgprs + 4 + (key->vs_prolog.has_ngg_cull_inputs ? 1 : 0);
+   struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs];
+   struct ac_arg input_vgpr_param[10];
+   LLVMValueRef input_vgprs[10];
+   unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs + num_input_vgprs;
+   unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
+
+   memset(&ctx->args, 0, sizeof(ctx->args));
+
+   /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
+   returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) * sizeof(LLVMTypeRef));
+   num_returns = 0;
+
+   /* Declare input and output SGPRs. */
+   for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &input_sgpr_param[i]);
+      returns[num_returns++] = ctx->ac.i32;
+   }
+
+   struct ac_arg merged_wave_info = input_sgpr_param[3];
+
+   /* Preloaded VGPRs (outputs must be floats) */
+   for (i = 0; i < num_input_vgprs; i++) {
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]);
+      returns[num_returns++] = ctx->ac.f32;
+   }
+
+   /* Vertex load indices. */
+   for (i = 0; i < key->vs_prolog.num_inputs; i++)
+      returns[num_returns++] = ctx->ac.f32;
+
+   /* Create the function. */
+   si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0);
+   func = ctx->main_fn;
+
+   for (i = 0; i < num_input_vgprs; i++) {
+      input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]);
+   }
+
+   if (key->vs_prolog.num_merged_next_stage_vgprs) {
+      if (!key->vs_prolog.is_monolithic)
+         si_init_exec_from_input(ctx, merged_wave_info, 0);
+
+      if (key->vs_prolog.as_ls && ctx->screen->info.has_ls_vgpr_init_bug) {
+         /* If there are no HS threads, SPI loads the LS VGPRs
+          * starting at VGPR 0. Shift them back to where they
+          * belong.
+          */
+         LLVMValueRef has_hs_threads =
+            LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
+                          si_unpack_param(ctx, input_sgpr_param[3], 8, 8), ctx->ac.i32_0, "");
+
+         for (i = 4; i > 0; --i) {
+            input_vgprs[i + 1] = LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
+                                                 input_vgprs[i + 1], input_vgprs[i - 1], "");
+         }
+      }
+   }
+
+   if (key->vs_prolog.gs_fast_launch_tri_list || key->vs_prolog.gs_fast_launch_tri_strip) {
+      LLVMValueRef wave_id, thread_id_in_tg;
+
+      wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4);
+      thread_id_in_tg =
+         ac_build_imad(&ctx->ac, wave_id, LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false),
+                       ac_get_thread_id(&ctx->ac));
+
+      /* The GS fast launch initializes all VGPRs to the value of
+       * the first thread, so we have to add the thread ID.
+       *
+       * Only these are initialized by the hw:
+       *   VGPR2: Base Primitive ID
+       *   VGPR5: Base Vertex ID
+       *   VGPR6: Instance ID
+       */
+
+      /* Put the vertex thread IDs into VGPRs as-is instead of packing them.
+       * The NGG cull shader will read them from there.
+       */
+      if (key->vs_prolog.gs_fast_launch_tri_list) {
+         input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg,       /* gs_vtx01_offset */
+                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */
+                                        LLVMConstInt(ctx->ac.i32, 0, 0));
+         input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg,       /* gs_vtx23_offset */
+                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */
+                                        LLVMConstInt(ctx->ac.i32, 1, 0));
+         input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg,       /* gs_vtx45_offset */
+                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */
+                                        LLVMConstInt(ctx->ac.i32, 2, 0));
+      } else {
+         assert(key->vs_prolog.gs_fast_launch_tri_strip);
+         LLVMBuilderRef builder = ctx->ac.builder;
+         /* Triangle indices: */
+         LLVMValueRef index[3] = {
+            thread_id_in_tg,
+            LLVMBuildAdd(builder, thread_id_in_tg, LLVMConstInt(ctx->ac.i32, 1, 0), ""),
+            LLVMBuildAdd(builder, thread_id_in_tg, LLVMConstInt(ctx->ac.i32, 2, 0), ""),
+         };
+         LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder, thread_id_in_tg, ctx->ac.i1, "");
+         LLVMValueRef flatshade_first = LLVMBuildICmp(
+            builder, LLVMIntEQ, si_unpack_param(ctx, ctx->vs_state_bits, 4, 2), ctx->ac.i32_0, "");
+
+         ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, index);
+         input_vgprs[0] = index[0];
+         input_vgprs[1] = index[1];
+         input_vgprs[4] = index[2];
+      }
+
+      /* Triangles always have all edge flags set initially. */
+      input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0);
+
+      input_vgprs[2] =
+         LLVMBuildAdd(ctx->ac.builder, input_vgprs[2], thread_id_in_tg, ""); /* PrimID */
+      input_vgprs[5] =
+         LLVMBuildAdd(ctx->ac.builder, input_vgprs[5], thread_id_in_tg, ""); /* VertexID */
+      input_vgprs[8] = input_vgprs[6];                                       /* InstanceID */
+   }
+
+   unsigned vertex_id_vgpr = first_vs_vgpr;
+   unsigned instance_id_vgpr = ctx->screen->info.chip_class >= GFX10
+                                  ? first_vs_vgpr + 3
+                                  : first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);
+
+   ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr];
+   ctx->abi.instance_id = input_vgprs[instance_id_vgpr];
+
+   /* InstanceID = VertexID >> 16;
+    * VertexID   = VertexID & 0xffff;
+    */
+   if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) {
+      ctx->abi.instance_id =
+         LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id, LLVMConstInt(ctx->ac.i32, 16, 0), "");
+      ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id,
+                                        LLVMConstInt(ctx->ac.i32, 0xffff, 0), "");
+   }
+
+   /* Copy inputs to outputs. This should be no-op, as the registers match,
+    * but it will prevent the compiler from overwriting them unintentionally.
+    */
+   ret = ctx->return_value;
+   for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
+      LLVMValueRef p = LLVMGetParam(func, i);
+      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
+   }
+   for (i = 0; i < num_input_vgprs; i++) {
+      LLVMValueRef p = input_vgprs[i];
+
+      if (i == vertex_id_vgpr)
+         p = ctx->abi.vertex_id;
+      else if (i == instance_id_vgpr)
+         p = ctx->abi.instance_id;
+
+      p = ac_to_float(&ctx->ac, p);
+      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, key->vs_prolog.num_input_sgprs + i, "");
+   }
+
+   /* Compute vertex load indices from instance divisors. */
+   LLVMValueRef instance_divisor_constbuf = NULL;
+
+   if (key->vs_prolog.states.instance_divisor_is_fetched) {
+      LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
+      LLVMValueRef buf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
+      instance_divisor_constbuf = ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
+   }
+
+   for (i = 0; i < key->vs_prolog.num_inputs; i++) {
+      bool divisor_is_one = key->vs_prolog.states.instance_divisor_is_one & (1u << i);
+      bool divisor_is_fetched = key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
+      LLVMValueRef index = NULL;
+
+      if (divisor_is_one) {
+         index = ctx->abi.instance_id;
+      } else if (divisor_is_fetched) {
+         LLVMValueRef udiv_factors[4];
+
+         for (unsigned j = 0; j < 4; j++) {
+            udiv_factors[j] = si_buffer_load_const(ctx, instance_divisor_constbuf,
+                                                   LLVMConstInt(ctx->ac.i32, i * 16 + j * 4, 0));
+            udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
+         }
+         /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
+          * Such InstanceID might not be achievable in a reasonable time though.
+          */
+         index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, udiv_factors[0],
+                                        udiv_factors[1], udiv_factors[2], udiv_factors[3]);
+      }
+
+      if (divisor_is_one || divisor_is_fetched) {
+         /* Add StartInstance. */
+         index =
+            LLVMBuildAdd(ctx->ac.builder, index,
+                         LLVMGetParam(ctx->main_fn, user_sgpr_base + SI_SGPR_START_INSTANCE), "");
+      } else {
+         /* VertexID + BaseVertex */
+         index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
+                              LLVMGetParam(func, user_sgpr_base + SI_SGPR_BASE_VERTEX), "");
+      }
+
+      index = ac_to_float(&ctx->ac, index);
+      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, ctx->args.arg_count + i, "");
+   }
+
+   si_llvm_build_ret(ctx, ret);
 }
 
 static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-
-       /* For non-indexed draws, the base vertex set by the driver
-        * (for direct draws) or the CP (for indirect draws) is the
-        * first vertex ID, but GLSL expects 0 to be returned.
-        */
-       LLVMValueRef vs_state = ac_get_arg(&ctx->ac,
-                                          ctx->vs_state_bits);
-       LLVMValueRef indexed;
-
-       indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->ac.i32_1, "");
-       indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->ac.i1, "");
-
-       return LLVMBuildSelect(ctx->ac.builder, indexed,
-                              ac_get_arg(&ctx->ac, ctx->args.base_vertex),
-                              ctx->ac.i32_0, "");
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+
+   /* For non-indexed draws, the base vertex set by the driver
+    * (for direct draws) or the CP (for indirect draws) is the
+    * first vertex ID, but GLSL expects 0 to be returned.
+    */
+   LLVMValueRef vs_state = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
+   LLVMValueRef indexed;
+
+   indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->ac.i32_1, "");
+   indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->ac.i1, "");
+
+   return LLVMBuildSelect(ctx->ac.builder, indexed, ac_get_arg(&ctx->ac, ctx->args.base_vertex),
+                          ctx->ac.i32_0, "");
 }
 
 void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader)
 {
-       struct si_shader *shader = ctx->shader;
-
-       if (shader->key.as_ls)
-               ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
-       else if (shader->key.as_es)
-               ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
-       else if (shader->key.opt.vs_as_prim_discard_cs)
-               ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
-       else if (ngg_cull_shader)
-               ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32;
-       else if (shader->key.as_ngg)
-               ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
-       else
-               ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
-
-       ctx->abi.load_base_vertex = get_base_vertex;
+   struct si_shader *shader = ctx->shader;
+
+   if (shader->key.as_ls)
+      ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
+   else if (shader->key.as_es)
+      ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
+   else if (shader->key.opt.vs_as_prim_discard_cs)
+      ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
+   else if (ngg_cull_shader)
+      ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32;
+   else if (shader->key.as_ngg)
+      ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
+   else
+      ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
+
+   ctx->abi.load_base_vertex = get_base_vertex;
 }
index 49393af3abd051548f09ead4a9b59d9742ac15e5..ddbb5c5c9c705fad609e3614ddf1af219b36410a 100644 (file)
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include "si_shader_internal.h"
-#include "si_pipe.h"
-
 #include "ac_nir_to_llvm.h"
-
-#include "tgsi/tgsi_from_mesa.h"
-
 #include "compiler/nir/nir.h"
-#include "compiler/nir_types.h"
 #include "compiler/nir/nir_builder.h"
 #include "compiler/nir/nir_deref.h"
+#include "compiler/nir_types.h"
+#include "si_pipe.h"
+#include "si_shader_internal.h"
+#include "tgsi/tgsi_from_mesa.h"
 
 static const nir_deref_instr *tex_get_texture_deref(nir_tex_instr *instr)
 {
-       for (unsigned i = 0; i < instr->num_srcs; i++) {
-               switch (instr->src[i].src_type) {
-               case nir_tex_src_texture_deref:
-                       return nir_src_as_deref(instr->src[i].src);
-               default:
-                       break;
-               }
-       }
-
-       return NULL;
+   for (unsigned i = 0; i < instr->num_srcs; i++) {
+      switch (instr->src[i].src_type) {
+      case nir_tex_src_texture_deref:
+         return nir_src_as_deref(instr->src[i].src);
+      default:
+         break;
+      }
+   }
+
+   return NULL;
 }
 
-static nir_variable *intrinsic_get_var(nir_intrinsic_instr *instr)
+static nir_variable *intrinsic_get_var(nir_intrinsic_instr *instr)
 {
-       return nir_deref_instr_get_variable(nir_src_as_deref(instr->src[0]));
+   return nir_deref_instr_get_variable(nir_src_as_deref(instr->src[0]));
 }
 
-static void gather_usage_helper(const nir_deref_instr **deref_ptr,
-                               unsigned location,
-                               uint8_t mask,
-                               uint8_t *usage_mask)
+static void gather_usage_helper(const nir_deref_instr **deref_ptr, unsigned location, uint8_t mask,
+                                uint8_t *usage_mask)
 {
-       for (; *deref_ptr; deref_ptr++) {
-               const nir_deref_instr *deref = *deref_ptr;
-               switch (deref->deref_type) {
-               case nir_deref_type_array: {
-                       unsigned elem_size =
-                               glsl_count_attribute_slots(deref->type, false);
-                       if (nir_src_is_const(deref->arr.index)) {
-                               location += elem_size * nir_src_as_uint(deref->arr.index);
-                       } else {
-                               unsigned array_elems =
-                                       glsl_get_length(deref_ptr[-1]->type);
-                               for (unsigned i = 0; i < array_elems; i++) {
-                                       gather_usage_helper(deref_ptr + 1,
-                                                           location + elem_size * i,
-                                                           mask, usage_mask);
-                               }
-                               return;
-                       }
-                       break;
-               }
-               case nir_deref_type_struct: {
-                       const struct glsl_type *parent_type =
-                               deref_ptr[-1]->type;
-                       unsigned index = deref->strct.index;
-                       for (unsigned i = 0; i < index; i++) {
-                               const struct glsl_type *ft = glsl_get_struct_field(parent_type, i);
-                               location += glsl_count_attribute_slots(ft, false);
-                       }
-                       break;
-               }
-               default:
-                       unreachable("Unhandled deref type in gather_components_used_helper");
-               }
-       }
-
-       usage_mask[location] |= mask & 0xf;
-       if (mask & 0xf0)
-               usage_mask[location + 1] |= (mask >> 4) & 0xf;
+   for (; *deref_ptr; deref_ptr++) {
+      const nir_deref_instr *deref = *deref_ptr;
+      switch (deref->deref_type) {
+      case nir_deref_type_array: {
+         unsigned elem_size = glsl_count_attribute_slots(deref->type, false);
+         if (nir_src_is_const(deref->arr.index)) {
+            location += elem_size * nir_src_as_uint(deref->arr.index);
+         } else {
+            unsigned array_elems = glsl_get_length(deref_ptr[-1]->type);
+            for (unsigned i = 0; i < array_elems; i++) {
+               gather_usage_helper(deref_ptr + 1, location + elem_size * i, mask, usage_mask);
+            }
+            return;
+         }
+         break;
+      }
+      case nir_deref_type_struct: {
+         const struct glsl_type *parent_type = deref_ptr[-1]->type;
+         unsigned index = deref->strct.index;
+         for (unsigned i = 0; i < index; i++) {
+            const struct glsl_type *ft = glsl_get_struct_field(parent_type, i);
+            location += glsl_count_attribute_slots(ft, false);
+         }
+         break;
+      }
+      default:
+         unreachable("Unhandled deref type in gather_components_used_helper");
+      }
+   }
+
+   usage_mask[location] |= mask & 0xf;
+   if (mask & 0xf0)
+      usage_mask[location + 1] |= (mask >> 4) & 0xf;
 }
 
-static void gather_usage(const nir_deref_instr *deref,
-                        uint8_t mask,
-                        uint8_t *usage_mask)
+static void gather_usage(const nir_deref_instr *deref, uint8_t mask, uint8_t *usage_mask)
 {
-       nir_deref_path path;
-       nir_deref_path_init(&path, (nir_deref_instr *)deref, NULL);
-
-       unsigned location_frac = path.path[0]->var->data.location_frac;
-       if (glsl_type_is_64bit(deref->type)) {
-               uint8_t new_mask = 0;
-               for (unsigned i = 0; i < 4; i++) {
-                       if (mask & (1 << i))
-                               new_mask |= 0x3 << (2 * i);
-               }
-               mask = new_mask << location_frac;
-       } else {
-               mask <<= location_frac;
-               mask &= 0xf;
-       }
-
-       gather_usage_helper((const nir_deref_instr **)&path.path[1],
-                           path.path[0]->var->data.driver_location,
-                           mask, usage_mask);
-
-       nir_deref_path_finish(&path);
+   nir_deref_path path;
+   nir_deref_path_init(&path, (nir_deref_instr *)deref, NULL);
+
+   unsigned location_frac = path.path[0]->var->data.location_frac;
+   if (glsl_type_is_64bit(deref->type)) {
+      uint8_t new_mask = 0;
+      for (unsigned i = 0; i < 4; i++) {
+         if (mask & (1 << i))
+            new_mask |= 0x3 << (2 * i);
+      }
+      mask = new_mask << location_frac;
+   } else {
+      mask <<= location_frac;
+      mask &= 0xf;
+   }
+
+   gather_usage_helper((const nir_deref_instr **)&path.path[1],
+                       path.path[0]->var->data.driver_location, mask, usage_mask);
+
+   nir_deref_path_finish(&path);
 }
 
 static void gather_intrinsic_load_deref_input_info(const nir_shader *nir,
-                                                  const nir_intrinsic_instr *instr,
-                                                  const nir_deref_instr *deref,
-                                                  struct si_shader_info *info)
+                                                   const nir_intrinsic_instr *instr,
+                                                   const nir_deref_instr *deref,
+                                                   struct si_shader_info *info)
 {
-       switch (nir->info.stage) {
-       case MESA_SHADER_VERTEX:
-               gather_usage(deref, nir_ssa_def_components_read(&instr->dest.ssa),
-                            info->input_usage_mask);
-       default:;
-       }
+   switch (nir->info.stage) {
+   case MESA_SHADER_VERTEX:
+      gather_usage(deref, nir_ssa_def_components_read(&instr->dest.ssa), info->input_usage_mask);
+   default:;
+   }
 }
 
 static void gather_intrinsic_load_deref_output_info(const nir_shader *nir,
-                                                   const nir_intrinsic_instr *instr,
-                                                   nir_variable *var,
-                                                   struct si_shader_info *info)
+                                                    const nir_intrinsic_instr *instr,
+                                                    nir_variable *var, struct si_shader_info *info)
 {
-       assert(var && var->data.mode == nir_var_shader_out);
-
-       switch (nir->info.stage) {
-       case MESA_SHADER_TESS_CTRL:
-               if (var->data.location == VARYING_SLOT_TESS_LEVEL_INNER ||
-                   var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER)
-                       info->reads_tessfactor_outputs = true;
-               else if (var->data.patch)
-                       info->reads_perpatch_outputs = true;
-               else
-                       info->reads_pervertex_outputs = true;
-               break;
-
-       case MESA_SHADER_FRAGMENT:
-               if (var->data.fb_fetch_output)
-                       info->uses_fbfetch = true;
-               break;
-       default:;
-       }
+   assert(var && var->data.mode == nir_var_shader_out);
+
+   switch (nir->info.stage) {
+   case MESA_SHADER_TESS_CTRL:
+      if (var->data.location == VARYING_SLOT_TESS_LEVEL_INNER ||
+          var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER)
+         info->reads_tessfactor_outputs = true;
+      else if (var->data.patch)
+         info->reads_perpatch_outputs = true;
+      else
+         info->reads_pervertex_outputs = true;
+      break;
+
+   case MESA_SHADER_FRAGMENT:
+      if (var->data.fb_fetch_output)
+         info->uses_fbfetch = true;
+      break;
+   default:;
+   }
 }
 
 static void gather_intrinsic_store_deref_output_info(const nir_shader *nir,
-                                                    const nir_intrinsic_instr *instr,
-                                                    const nir_deref_instr *deref,
-                                                    struct si_shader_info *info)
+                                                     const nir_intrinsic_instr *instr,
+                                                     const nir_deref_instr *deref,
+                                                     struct si_shader_info *info)
 {
-       switch (nir->info.stage) {
-       case MESA_SHADER_VERTEX: /* needed by LS, ES */
-       case MESA_SHADER_TESS_EVAL: /* needed by ES */
-       case MESA_SHADER_GEOMETRY:
-               gather_usage(deref, nir_intrinsic_write_mask(instr),
-                            info->output_usagemask);
-               break;
-       default:;
-       }
+   switch (nir->info.stage) {
+   case MESA_SHADER_VERTEX:    /* needed by LS, ES */
+   case MESA_SHADER_TESS_EVAL: /* needed by ES */
+   case MESA_SHADER_GEOMETRY:
+      gather_usage(deref, nir_intrinsic_write_mask(instr), info->output_usagemask);
+      break;
+   default:;
+   }
 }
 
-static void scan_instruction(const struct nir_shader *nir,
-                            struct si_shader_info *info,
-                            nir_instr *instr)
+static void scan_instruction(const struct nir_shader *nir, struct si_shader_info *info,
+                             nir_instr *instr)
 {
-       if (instr->type == nir_instr_type_alu) {
-               nir_alu_instr *alu = nir_instr_as_alu(instr);
-
-               switch (alu->op) {
-               case nir_op_fddx:
-               case nir_op_fddy:
-               case nir_op_fddx_fine:
-               case nir_op_fddy_fine:
-               case nir_op_fddx_coarse:
-               case nir_op_fddy_coarse:
-                       info->uses_derivatives = true;
-                       break;
-               default:
-                       break;
-               }
-       } else if (instr->type == nir_instr_type_tex) {
-               nir_tex_instr *tex = nir_instr_as_tex(instr);
-               const nir_deref_instr *deref = tex_get_texture_deref(tex);
-               nir_variable *var = deref ? nir_deref_instr_get_variable(deref) : NULL;
-
-               if (!var) {
-                       info->samplers_declared |=
-                               u_bit_consecutive(tex->sampler_index, 1);
-               } else {
-                       if (deref->mode != nir_var_uniform || var->data.bindless)
-                               info->uses_bindless_samplers = true;
-               }
-
-               switch (tex->op) {
-               case nir_texop_tex:
-               case nir_texop_txb:
-               case nir_texop_lod:
-                       info->uses_derivatives = true;
-                       break;
-               default:
-                       break;
-               }
-       } else if (instr->type == nir_instr_type_intrinsic) {
-               nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-
-               switch (intr->intrinsic) {
-               case nir_intrinsic_load_front_face:
-                       info->uses_frontface = 1;
-                       break;
-               case nir_intrinsic_load_instance_id:
-                       info->uses_instanceid = 1;
-                       break;
-               case nir_intrinsic_load_invocation_id:
-                       info->uses_invocationid = true;
-                       break;
-               case nir_intrinsic_load_num_work_groups:
-                       info->uses_grid_size = true;
-                       break;
-               case nir_intrinsic_load_local_invocation_index:
-               case nir_intrinsic_load_subgroup_id:
-               case nir_intrinsic_load_num_subgroups:
-                       info->uses_subgroup_info = true;
-                       break;
-               case nir_intrinsic_load_local_group_size:
-                       /* The block size is translated to IMM with a fixed block size. */
-                       if (info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
-                               info->uses_block_size = true;
-                       break;
-               case nir_intrinsic_load_local_invocation_id:
-               case nir_intrinsic_load_work_group_id: {
-                       unsigned mask = nir_ssa_def_components_read(&intr->dest.ssa);
-                       while (mask) {
-                               unsigned i = u_bit_scan(&mask);
-
-                               if (intr->intrinsic == nir_intrinsic_load_work_group_id)
-                                       info->uses_block_id[i] = true;
-                               else
-                                       info->uses_thread_id[i] = true;
-                       }
-                       break;
-               }
-               case nir_intrinsic_load_vertex_id:
-                       info->uses_vertexid = 1;
-                       break;
-               case nir_intrinsic_load_vertex_id_zero_base:
-                       info->uses_vertexid_nobase = 1;
-                       break;
-               case nir_intrinsic_load_base_vertex:
-                       info->uses_basevertex = 1;
-                       break;
-               case nir_intrinsic_load_draw_id:
-                       info->uses_drawid = 1;
-                       break;
-               case nir_intrinsic_load_primitive_id:
-                       info->uses_primid = 1;
-                       break;
-               case nir_intrinsic_load_sample_mask_in:
-                       info->reads_samplemask = true;
-                       break;
-               case nir_intrinsic_load_tess_level_inner:
-               case nir_intrinsic_load_tess_level_outer:
-                       info->reads_tess_factors = true;
-                       break;
-               case nir_intrinsic_bindless_image_load:
-               case nir_intrinsic_bindless_image_size:
-               case nir_intrinsic_bindless_image_samples:
-                       info->uses_bindless_images = true;
-                       break;
-               case nir_intrinsic_bindless_image_store:
-                       info->uses_bindless_images = true;
-                       info->writes_memory = true;
-                       info->num_memory_instructions++; /* we only care about stores */
-                       break;
-               case nir_intrinsic_image_deref_store:
-                       info->writes_memory = true;
-                       info->num_memory_instructions++; /* we only care about stores */
-                       break;
-               case nir_intrinsic_bindless_image_atomic_add:
-               case nir_intrinsic_bindless_image_atomic_imin:
-               case nir_intrinsic_bindless_image_atomic_umin:
-               case nir_intrinsic_bindless_image_atomic_imax:
-               case nir_intrinsic_bindless_image_atomic_umax:
-               case nir_intrinsic_bindless_image_atomic_and:
-               case nir_intrinsic_bindless_image_atomic_or:
-               case nir_intrinsic_bindless_image_atomic_xor:
-               case nir_intrinsic_bindless_image_atomic_exchange:
-               case nir_intrinsic_bindless_image_atomic_comp_swap:
-                       info->uses_bindless_images = true;
-                       info->writes_memory = true;
-                       info->num_memory_instructions++; /* we only care about stores */
-                       break;
-               case nir_intrinsic_image_deref_atomic_add:
-               case nir_intrinsic_image_deref_atomic_imin:
-               case nir_intrinsic_image_deref_atomic_umin:
-               case nir_intrinsic_image_deref_atomic_imax:
-               case nir_intrinsic_image_deref_atomic_umax:
-               case nir_intrinsic_image_deref_atomic_and:
-               case nir_intrinsic_image_deref_atomic_or:
-               case nir_intrinsic_image_deref_atomic_xor:
-               case nir_intrinsic_image_deref_atomic_exchange:
-               case nir_intrinsic_image_deref_atomic_comp_swap:
-               case nir_intrinsic_image_deref_atomic_inc_wrap:
-               case nir_intrinsic_image_deref_atomic_dec_wrap:
-                       info->writes_memory = true;
-                       info->num_memory_instructions++; /* we only care about stores */
-                       break;
-               case nir_intrinsic_store_ssbo:
-               case nir_intrinsic_ssbo_atomic_add:
-               case nir_intrinsic_ssbo_atomic_imin:
-               case nir_intrinsic_ssbo_atomic_umin:
-               case nir_intrinsic_ssbo_atomic_imax:
-               case nir_intrinsic_ssbo_atomic_umax:
-               case nir_intrinsic_ssbo_atomic_and:
-               case nir_intrinsic_ssbo_atomic_or:
-               case nir_intrinsic_ssbo_atomic_xor:
-               case nir_intrinsic_ssbo_atomic_exchange:
-               case nir_intrinsic_ssbo_atomic_comp_swap:
-                       info->writes_memory = true;
-                       info->num_memory_instructions++; /* we only care about stores */
-                       break;
-               case nir_intrinsic_load_color0:
-               case nir_intrinsic_load_color1: {
-                       unsigned index = intr->intrinsic == nir_intrinsic_load_color1;
-                       uint8_t mask = nir_ssa_def_components_read(&intr->dest.ssa);
-                       info->colors_read |= mask << (index * 4);
-                       break;
-               }
-               case nir_intrinsic_load_barycentric_pixel:
-               case nir_intrinsic_load_barycentric_centroid:
-               case nir_intrinsic_load_barycentric_sample:
-               case nir_intrinsic_load_barycentric_at_offset: /* uses center */
-               case nir_intrinsic_load_barycentric_at_sample: { /* uses center */
-                       unsigned mode = nir_intrinsic_interp_mode(intr);
-
-                       if (mode == INTERP_MODE_FLAT)
-                               break;
-
-                       if (mode == INTERP_MODE_NOPERSPECTIVE) {
-                               if (intr->intrinsic == nir_intrinsic_load_barycentric_sample)
-                                       info->uses_linear_sample = true;
-                               else if (intr->intrinsic == nir_intrinsic_load_barycentric_centroid)
-                                       info->uses_linear_centroid = true;
-                               else
-                                       info->uses_linear_center = true;
-
-                               if (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample)
-                                       info->uses_linear_opcode_interp_sample = true;
-                       } else {
-                               if (intr->intrinsic == nir_intrinsic_load_barycentric_sample)
-                                       info->uses_persp_sample = true;
-                               else if (intr->intrinsic == nir_intrinsic_load_barycentric_centroid)
-                                       info->uses_persp_centroid = true;
-                               else
-                                       info->uses_persp_center = true;
-
-                               if (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample)
-                                       info->uses_persp_opcode_interp_sample = true;
-                       }
-                       break;
-               }
-               case nir_intrinsic_load_deref: {
-                       nir_variable *var = intrinsic_get_var(intr);
-                       nir_variable_mode mode = var->data.mode;
-
-                       if (mode == nir_var_shader_in) {
-                               /* PS inputs use the interpolated load intrinsics. */
-                               assert(nir->info.stage != MESA_SHADER_FRAGMENT);
-                               gather_intrinsic_load_deref_input_info(nir, intr,
-                                                                      nir_src_as_deref(intr->src[0]), info);
-                       } else if (mode == nir_var_shader_out) {
-                               gather_intrinsic_load_deref_output_info(nir, intr, var, info);
-                       }
-                       break;
-               }
-               case nir_intrinsic_store_deref: {
-                       nir_variable *var = intrinsic_get_var(intr);
-
-                       if (var->data.mode == nir_var_shader_out)
-                               gather_intrinsic_store_deref_output_info(nir, intr,
-                                                                        nir_src_as_deref(intr->src[0]), info);
-                       break;
-               }
-               case nir_intrinsic_interp_deref_at_centroid:
-               case nir_intrinsic_interp_deref_at_sample:
-               case nir_intrinsic_interp_deref_at_offset:
-                       unreachable("interp opcodes should have been lowered");
-                       break;
-               default:
-                       break;
-               }
-       }
+   if (instr->type == nir_instr_type_alu) {
+      nir_alu_instr *alu = nir_instr_as_alu(instr);
+
+      switch (alu->op) {
+      case nir_op_fddx:
+      case nir_op_fddy:
+      case nir_op_fddx_fine:
+      case nir_op_fddy_fine:
+      case nir_op_fddx_coarse:
+      case nir_op_fddy_coarse:
+         info->uses_derivatives = true;
+         break;
+      default:
+         break;
+      }
+   } else if (instr->type == nir_instr_type_tex) {
+      nir_tex_instr *tex = nir_instr_as_tex(instr);
+      const nir_deref_instr *deref = tex_get_texture_deref(tex);
+      nir_variable *var = deref ? nir_deref_instr_get_variable(deref) : NULL;
+
+      if (!var) {
+         info->samplers_declared |= u_bit_consecutive(tex->sampler_index, 1);
+      } else {
+         if (deref->mode != nir_var_uniform || var->data.bindless)
+            info->uses_bindless_samplers = true;
+      }
+
+      switch (tex->op) {
+      case nir_texop_tex:
+      case nir_texop_txb:
+      case nir_texop_lod:
+         info->uses_derivatives = true;
+         break;
+      default:
+         break;
+      }
+   } else if (instr->type == nir_instr_type_intrinsic) {
+      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+      switch (intr->intrinsic) {
+      case nir_intrinsic_load_front_face:
+         info->uses_frontface = 1;
+         break;
+      case nir_intrinsic_load_instance_id:
+         info->uses_instanceid = 1;
+         break;
+      case nir_intrinsic_load_invocation_id:
+         info->uses_invocationid = true;
+         break;
+      case nir_intrinsic_load_num_work_groups:
+         info->uses_grid_size = true;
+         break;
+      case nir_intrinsic_load_local_invocation_index:
+      case nir_intrinsic_load_subgroup_id:
+      case nir_intrinsic_load_num_subgroups:
+         info->uses_subgroup_info = true;
+         break;
+      case nir_intrinsic_load_local_group_size:
+         /* The block size is translated to IMM with a fixed block size. */
+         if (info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
+            info->uses_block_size = true;
+         break;
+      case nir_intrinsic_load_local_invocation_id:
+      case nir_intrinsic_load_work_group_id: {
+         unsigned mask = nir_ssa_def_components_read(&intr->dest.ssa);
+         while (mask) {
+            unsigned i = u_bit_scan(&mask);
+
+            if (intr->intrinsic == nir_intrinsic_load_work_group_id)
+               info->uses_block_id[i] = true;
+            else
+               info->uses_thread_id[i] = true;
+         }
+         break;
+      }
+      case nir_intrinsic_load_vertex_id:
+         info->uses_vertexid = 1;
+         break;
+      case nir_intrinsic_load_vertex_id_zero_base:
+         info->uses_vertexid_nobase = 1;
+         break;
+      case nir_intrinsic_load_base_vertex:
+         info->uses_basevertex = 1;
+         break;
+      case nir_intrinsic_load_draw_id:
+         info->uses_drawid = 1;
+         break;
+      case nir_intrinsic_load_primitive_id:
+         info->uses_primid = 1;
+         break;
+      case nir_intrinsic_load_sample_mask_in:
+         info->reads_samplemask = true;
+         break;
+      case nir_intrinsic_load_tess_level_inner:
+      case nir_intrinsic_load_tess_level_outer:
+         info->reads_tess_factors = true;
+         break;
+      case nir_intrinsic_bindless_image_load:
+      case nir_intrinsic_bindless_image_size:
+      case nir_intrinsic_bindless_image_samples:
+         info->uses_bindless_images = true;
+         break;
+      case nir_intrinsic_bindless_image_store:
+         info->uses_bindless_images = true;
+         info->writes_memory = true;
+         info->num_memory_instructions++; /* we only care about stores */
+         break;
+      case nir_intrinsic_image_deref_store:
+         info->writes_memory = true;
+         info->num_memory_instructions++; /* we only care about stores */
+         break;
+      case nir_intrinsic_bindless_image_atomic_add:
+      case nir_intrinsic_bindless_image_atomic_imin:
+      case nir_intrinsic_bindless_image_atomic_umin:
+      case nir_intrinsic_bindless_image_atomic_imax:
+      case nir_intrinsic_bindless_image_atomic_umax:
+      case nir_intrinsic_bindless_image_atomic_and:
+      case nir_intrinsic_bindless_image_atomic_or:
+      case nir_intrinsic_bindless_image_atomic_xor:
+      case nir_intrinsic_bindless_image_atomic_exchange:
+      case nir_intrinsic_bindless_image_atomic_comp_swap:
+         info->uses_bindless_images = true;
+         info->writes_memory = true;
+         info->num_memory_instructions++; /* we only care about stores */
+         break;
+      case nir_intrinsic_image_deref_atomic_add:
+      case nir_intrinsic_image_deref_atomic_imin:
+      case nir_intrinsic_image_deref_atomic_umin:
+      case nir_intrinsic_image_deref_atomic_imax:
+      case nir_intrinsic_image_deref_atomic_umax:
+      case nir_intrinsic_image_deref_atomic_and:
+      case nir_intrinsic_image_deref_atomic_or:
+      case nir_intrinsic_image_deref_atomic_xor:
+      case nir_intrinsic_image_deref_atomic_exchange:
+      case nir_intrinsic_image_deref_atomic_comp_swap:
+      case nir_intrinsic_image_deref_atomic_inc_wrap:
+      case nir_intrinsic_image_deref_atomic_dec_wrap:
+         info->writes_memory = true;
+         info->num_memory_instructions++; /* we only care about stores */
+         break;
+      case nir_intrinsic_store_ssbo:
+      case nir_intrinsic_ssbo_atomic_add:
+      case nir_intrinsic_ssbo_atomic_imin:
+      case nir_intrinsic_ssbo_atomic_umin:
+      case nir_intrinsic_ssbo_atomic_imax:
+      case nir_intrinsic_ssbo_atomic_umax:
+      case nir_intrinsic_ssbo_atomic_and:
+      case nir_intrinsic_ssbo_atomic_or:
+      case nir_intrinsic_ssbo_atomic_xor:
+      case nir_intrinsic_ssbo_atomic_exchange:
+      case nir_intrinsic_ssbo_atomic_comp_swap:
+         info->writes_memory = true;
+         info->num_memory_instructions++; /* we only care about stores */
+         break;
+      case nir_intrinsic_load_color0:
+      case nir_intrinsic_load_color1: {
+         unsigned index = intr->intrinsic == nir_intrinsic_load_color1;
+         uint8_t mask = nir_ssa_def_components_read(&intr->dest.ssa);
+         info->colors_read |= mask << (index * 4);
+         break;
+      }
+      case nir_intrinsic_load_barycentric_pixel:
+      case nir_intrinsic_load_barycentric_centroid:
+      case nir_intrinsic_load_barycentric_sample:
+      case nir_intrinsic_load_barycentric_at_offset:   /* uses center */
+      case nir_intrinsic_load_barycentric_at_sample: { /* uses center */
+         unsigned mode = nir_intrinsic_interp_mode(intr);
+
+         if (mode == INTERP_MODE_FLAT)
+            break;
+
+         if (mode == INTERP_MODE_NOPERSPECTIVE) {
+            if (intr->intrinsic == nir_intrinsic_load_barycentric_sample)
+               info->uses_linear_sample = true;
+            else if (intr->intrinsic == nir_intrinsic_load_barycentric_centroid)
+               info->uses_linear_centroid = true;
+            else
+               info->uses_linear_center = true;
+
+            if (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample)
+               info->uses_linear_opcode_interp_sample = true;
+         } else {
+            if (intr->intrinsic == nir_intrinsic_load_barycentric_sample)
+               info->uses_persp_sample = true;
+            else if (intr->intrinsic == nir_intrinsic_load_barycentric_centroid)
+               info->uses_persp_centroid = true;
+            else
+               info->uses_persp_center = true;
+
+            if (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample)
+               info->uses_persp_opcode_interp_sample = true;
+         }
+         break;
+      }
+      case nir_intrinsic_load_deref: {
+         nir_variable *var = intrinsic_get_var(intr);
+         nir_variable_mode mode = var->data.mode;
+
+         if (mode == nir_var_shader_in) {
+            /* PS inputs use the interpolated load intrinsics. */
+            assert(nir->info.stage != MESA_SHADER_FRAGMENT);
+            gather_intrinsic_load_deref_input_info(nir, intr, nir_src_as_deref(intr->src[0]), info);
+         } else if (mode == nir_var_shader_out) {
+            gather_intrinsic_load_deref_output_info(nir, intr, var, info);
+         }
+         break;
+      }
+      case nir_intrinsic_store_deref: {
+         nir_variable *var = intrinsic_get_var(intr);
+
+         if (var->data.mode == nir_var_shader_out)
+            gather_intrinsic_store_deref_output_info(nir, intr, nir_src_as_deref(intr->src[0]),
+                                                     info);
+         break;
+      }
+      case nir_intrinsic_interp_deref_at_centroid:
+      case nir_intrinsic_interp_deref_at_sample:
+      case nir_intrinsic_interp_deref_at_offset:
+         unreachable("interp opcodes should have been lowered");
+         break;
+      default:
+         break;
+      }
+   }
 }
 
-static void scan_output_slot(const nir_variable *var,
-                            unsigned var_idx,
-                            unsigned component, unsigned num_components,
-                            struct si_shader_info *info)
+static void scan_output_slot(const nir_variable *var, unsigned var_idx, unsigned component,
+                             unsigned num_components, struct si_shader_info *info)
 {
-       assert(component + num_components <= 4);
-       assert(component < 4);
-
-       unsigned semantic_name, semantic_index;
-
-       unsigned location = var->data.location + var_idx;
-       unsigned drv_location = var->data.driver_location + var_idx;
-
-       if (info->processor == PIPE_SHADER_FRAGMENT) {
-               tgsi_get_gl_frag_result_semantic(location,
-                       &semantic_name, &semantic_index);
-
-               /* Adjust for dual source blending */
-               if (var->data.index > 0) {
-                       semantic_index++;
-               }
-       } else {
-               tgsi_get_gl_varying_semantic(location, true,
-                                            &semantic_name, &semantic_index);
-       }
-
-       ubyte usagemask = ((1 << num_components) - 1) << component;
-
-       unsigned gs_out_streams;
-       if (var->data.stream & NIR_STREAM_PACKED) {
-               gs_out_streams = var->data.stream & ~NIR_STREAM_PACKED;
-       } else {
-               assert(var->data.stream < 4);
-               gs_out_streams = 0;
-               for (unsigned j = 0; j < num_components; ++j)
-                       gs_out_streams |= var->data.stream << (2 * (component + j));
-       }
-
-       unsigned streamx = gs_out_streams & 3;
-       unsigned streamy = (gs_out_streams >> 2) & 3;
-       unsigned streamz = (gs_out_streams >> 4) & 3;
-       unsigned streamw = (gs_out_streams >> 6) & 3;
-
-       if (usagemask & TGSI_WRITEMASK_X) {
-               info->output_streams[drv_location] |= streamx;
-               info->num_stream_output_components[streamx]++;
-       }
-       if (usagemask & TGSI_WRITEMASK_Y) {
-               info->output_streams[drv_location] |= streamy << 2;
-               info->num_stream_output_components[streamy]++;
-       }
-       if (usagemask & TGSI_WRITEMASK_Z) {
-               info->output_streams[drv_location] |= streamz << 4;
-               info->num_stream_output_components[streamz]++;
-       }
-       if (usagemask & TGSI_WRITEMASK_W) {
-               info->output_streams[drv_location] |= streamw << 6;
-               info->num_stream_output_components[streamw]++;
-       }
-
-       info->output_semantic_name[drv_location] = semantic_name;
-       info->output_semantic_index[drv_location] = semantic_index;
-
-       switch (semantic_name) {
-       case TGSI_SEMANTIC_PRIMID:
-               info->writes_primid = true;
-               break;
-       case TGSI_SEMANTIC_VIEWPORT_INDEX:
-               info->writes_viewport_index = true;
-               break;
-       case TGSI_SEMANTIC_LAYER:
-               info->writes_layer = true;
-               break;
-       case TGSI_SEMANTIC_PSIZE:
-               info->writes_psize = true;
-               break;
-       case TGSI_SEMANTIC_CLIPVERTEX:
-               info->writes_clipvertex = true;
-               break;
-       case TGSI_SEMANTIC_COLOR:
-               info->colors_written |= 1 << semantic_index;
-               break;
-       case TGSI_SEMANTIC_STENCIL:
-               info->writes_stencil = true;
-               break;
-       case TGSI_SEMANTIC_SAMPLEMASK:
-               info->writes_samplemask = true;
-               break;
-       case TGSI_SEMANTIC_EDGEFLAG:
-               info->writes_edgeflag = true;
-               break;
-       case TGSI_SEMANTIC_POSITION:
-               if (info->processor == PIPE_SHADER_FRAGMENT)
-                       info->writes_z = true;
-               else
-                       info->writes_position = true;
-               break;
-       }
+   assert(component + num_components <= 4);
+   assert(component < 4);
+
+   unsigned semantic_name, semantic_index;
+
+   unsigned location = var->data.location + var_idx;
+   unsigned drv_location = var->data.driver_location + var_idx;
+
+   if (info->processor == PIPE_SHADER_FRAGMENT) {
+      tgsi_get_gl_frag_result_semantic(location, &semantic_name, &semantic_index);
+
+      /* Adjust for dual source blending */
+      if (var->data.index > 0) {
+         semantic_index++;
+      }
+   } else {
+      tgsi_get_gl_varying_semantic(location, true, &semantic_name, &semantic_index);
+   }
+
+   ubyte usagemask = ((1 << num_components) - 1) << component;
+
+   unsigned gs_out_streams;
+   if (var->data.stream & NIR_STREAM_PACKED) {
+      gs_out_streams = var->data.stream & ~NIR_STREAM_PACKED;
+   } else {
+      assert(var->data.stream < 4);
+      gs_out_streams = 0;
+      for (unsigned j = 0; j < num_components; ++j)
+         gs_out_streams |= var->data.stream << (2 * (component + j));
+   }
+
+   unsigned streamx = gs_out_streams & 3;
+   unsigned streamy = (gs_out_streams >> 2) & 3;
+   unsigned streamz = (gs_out_streams >> 4) & 3;
+   unsigned streamw = (gs_out_streams >> 6) & 3;
+
+   if (usagemask & TGSI_WRITEMASK_X) {
+      info->output_streams[drv_location] |= streamx;
+      info->num_stream_output_components[streamx]++;
+   }
+   if (usagemask & TGSI_WRITEMASK_Y) {
+      info->output_streams[drv_location] |= streamy << 2;
+      info->num_stream_output_components[streamy]++;
+   }
+   if (usagemask & TGSI_WRITEMASK_Z) {
+      info->output_streams[drv_location] |= streamz << 4;
+      info->num_stream_output_components[streamz]++;
+   }
+   if (usagemask & TGSI_WRITEMASK_W) {
+      info->output_streams[drv_location] |= streamw << 6;
+      info->num_stream_output_components[streamw]++;
+   }
+
+   info->output_semantic_name[drv_location] = semantic_name;
+   info->output_semantic_index[drv_location] = semantic_index;
+
+   switch (semantic_name) {
+   case TGSI_SEMANTIC_PRIMID:
+      info->writes_primid = true;
+      break;
+   case TGSI_SEMANTIC_VIEWPORT_INDEX:
+      info->writes_viewport_index = true;
+      break;
+   case TGSI_SEMANTIC_LAYER:
+      info->writes_layer = true;
+      break;
+   case TGSI_SEMANTIC_PSIZE:
+      info->writes_psize = true;
+      break;
+   case TGSI_SEMANTIC_CLIPVERTEX:
+      info->writes_clipvertex = true;
+      break;
+   case TGSI_SEMANTIC_COLOR:
+      info->colors_written |= 1 << semantic_index;
+      break;
+   case TGSI_SEMANTIC_STENCIL:
+      info->writes_stencil = true;
+      break;
+   case TGSI_SEMANTIC_SAMPLEMASK:
+      info->writes_samplemask = true;
+      break;
+   case TGSI_SEMANTIC_EDGEFLAG:
+      info->writes_edgeflag = true;
+      break;
+   case TGSI_SEMANTIC_POSITION:
+      if (info->processor == PIPE_SHADER_FRAGMENT)
+         info->writes_z = true;
+      else
+         info->writes_position = true;
+      break;
+   }
 }
 
-static void scan_output_helper(const nir_variable *var,
-                              unsigned location,
-                              const struct glsl_type *type,
-                              struct si_shader_info *info)
+static void scan_output_helper(const nir_variable *var, unsigned location,
+                               const struct glsl_type *type, struct si_shader_info *info)
 {
-       if (glsl_type_is_struct(type) || glsl_type_is_interface(type)) {
-               for (unsigned i = 0; i < glsl_get_length(type); i++) {
-                       const struct glsl_type *ft = glsl_get_struct_field(type, i);
-                       scan_output_helper(var, location, ft, info);
-                       location += glsl_count_attribute_slots(ft, false);
-               }
-       } else if (glsl_type_is_array_or_matrix(type)) {
-               const struct glsl_type *elem_type =
-                       glsl_get_array_element(type);
-               unsigned num_elems = glsl_get_length(type);
-               if (var->data.compact) {
-                       assert(glsl_type_is_scalar(elem_type));
-                       assert(glsl_get_bit_size(elem_type) == 32);
-                       unsigned component = var->data.location_frac;
-                       scan_output_slot(var, location, component,
-                                        MIN2(num_elems, 4 - component), info);
-                       if (component + num_elems > 4) {
-                               scan_output_slot(var, location + 1, 0,
-                                                component + num_elems - 4, info);
-                       }
-
-               } else {
-                       unsigned elem_count = glsl_count_attribute_slots(elem_type, false);
-                       for (unsigned i = 0; i < num_elems; i++) {
-                               scan_output_helper(var, location, elem_type, info);
-                               location += elem_count;
-                       }
-               }
-       } else if (glsl_type_is_dual_slot(type)) {
-               unsigned component = var->data.location_frac;
-               scan_output_slot(var, location, component, 4 - component, info);
-               scan_output_slot(var, location + 1, 0, component + 2 * glsl_get_components(type) - 4,
-                                info);
-       } else {
-               unsigned component = var->data.location_frac;
-               assert(glsl_type_is_vector_or_scalar(type));
-               unsigned num_components = glsl_get_components(type);
-               if (glsl_type_is_64bit(type))
-                       num_components *= 2;
-               scan_output_slot(var, location, component, num_components, info);
-       }
+   if (glsl_type_is_struct(type) || glsl_type_is_interface(type)) {
+      for (unsigned i = 0; i < glsl_get_length(type); i++) {
+         const struct glsl_type *ft = glsl_get_struct_field(type, i);
+         scan_output_helper(var, location, ft, info);
+         location += glsl_count_attribute_slots(ft, false);
+      }
+   } else if (glsl_type_is_array_or_matrix(type)) {
+      const struct glsl_type *elem_type = glsl_get_array_element(type);
+      unsigned num_elems = glsl_get_length(type);
+      if (var->data.compact) {
+         assert(glsl_type_is_scalar(elem_type));
+         assert(glsl_get_bit_size(elem_type) == 32);
+         unsigned component = var->data.location_frac;
+         scan_output_slot(var, location, component, MIN2(num_elems, 4 - component), info);
+         if (component + num_elems > 4) {
+            scan_output_slot(var, location + 1, 0, component + num_elems - 4, info);
+         }
+
+      } else {
+         unsigned elem_count = glsl_count_attribute_slots(elem_type, false);
+         for (unsigned i = 0; i < num_elems; i++) {
+            scan_output_helper(var, location, elem_type, info);
+            location += elem_count;
+         }
+      }
+   } else if (glsl_type_is_dual_slot(type)) {
+      unsigned component = var->data.location_frac;
+      scan_output_slot(var, location, component, 4 - component, info);
+      scan_output_slot(var, location + 1, 0, component + 2 * glsl_get_components(type) - 4, info);
+   } else {
+      unsigned component = var->data.location_frac;
+      assert(glsl_type_is_vector_or_scalar(type));
+      unsigned num_components = glsl_get_components(type);
+      if (glsl_type_is_64bit(type))
+         num_components *= 2;
+      scan_output_slot(var, location, component, num_components, info);
+   }
 }
 
-void si_nir_scan_shader(const struct nir_shader *nir,
-                       struct si_shader_info *info)
+void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *info)
 {
-       nir_function *func;
-       unsigned i;
-
-       info->processor = pipe_shader_type_from_mesa(nir->info.stage);
-
-       info->properties[TGSI_PROPERTY_NEXT_SHADER] =
-               pipe_shader_type_from_mesa(nir->info.next_stage);
-
-       if (nir->info.stage == MESA_SHADER_VERTEX) {
-               info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] =
-                       nir->info.vs.window_space_position;
-               info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] =
-                       nir->info.vs.blit_sgprs_amd;
-       }
-
-       if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
-               info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT] =
-                       nir->info.tess.tcs_vertices_out;
-       }
-
-       if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
-               if (nir->info.tess.primitive_mode == GL_ISOLINES)
-                       info->properties[TGSI_PROPERTY_TES_PRIM_MODE] = PIPE_PRIM_LINES;
-               else
-                       info->properties[TGSI_PROPERTY_TES_PRIM_MODE] = nir->info.tess.primitive_mode;
-
-               STATIC_ASSERT((TESS_SPACING_EQUAL + 1) % 3 == PIPE_TESS_SPACING_EQUAL);
-               STATIC_ASSERT((TESS_SPACING_FRACTIONAL_ODD + 1) % 3 ==
-                             PIPE_TESS_SPACING_FRACTIONAL_ODD);
-               STATIC_ASSERT((TESS_SPACING_FRACTIONAL_EVEN + 1) % 3 ==
-                             PIPE_TESS_SPACING_FRACTIONAL_EVEN);
-
-               info->properties[TGSI_PROPERTY_TES_SPACING] = (nir->info.tess.spacing + 1) % 3;
-               info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW] = !nir->info.tess.ccw;
-               info->properties[TGSI_PROPERTY_TES_POINT_MODE] = nir->info.tess.point_mode;
-       }
-
-       if (nir->info.stage == MESA_SHADER_GEOMETRY) {
-               info->properties[TGSI_PROPERTY_GS_INPUT_PRIM] = nir->info.gs.input_primitive;
-               info->properties[TGSI_PROPERTY_GS_OUTPUT_PRIM] = nir->info.gs.output_primitive;
-               info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES] = nir->info.gs.vertices_out;
-               info->properties[TGSI_PROPERTY_GS_INVOCATIONS] = nir->info.gs.invocations;
-       }
-
-       if (nir->info.stage == MESA_SHADER_FRAGMENT) {
-               info->properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] =
-                       nir->info.fs.early_fragment_tests | nir->info.fs.post_depth_coverage;
-               info->properties[TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE] = nir->info.fs.post_depth_coverage;
-
-               if (nir->info.fs.pixel_center_integer) {
-                       info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] =
-                               TGSI_FS_COORD_PIXEL_CENTER_INTEGER;
-               }
-
-               if (nir->info.fs.depth_layout != FRAG_DEPTH_LAYOUT_NONE) {
-                       switch (nir->info.fs.depth_layout) {
-                       case FRAG_DEPTH_LAYOUT_ANY:
-                               info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_ANY;
-                               break;
-                       case FRAG_DEPTH_LAYOUT_GREATER:
-                               info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_GREATER;
-                               break;
-                       case FRAG_DEPTH_LAYOUT_LESS:
-                               info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_LESS;
-                               break;
-                       case FRAG_DEPTH_LAYOUT_UNCHANGED:
-                               info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_UNCHANGED;
-                               break;
-                       default:
-                               unreachable("Unknow depth layout");
-                       }
-               }
-       }
-
-       if (gl_shader_stage_is_compute(nir->info.stage)) {
-               info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] = nir->info.cs.local_size[0];
-               info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] = nir->info.cs.local_size[1];
-               info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] = nir->info.cs.local_size[2];
-               info->properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD] = nir->info.cs.user_data_components_amd;
-       }
-
-       i = 0;
-       uint64_t processed_inputs = 0;
-       nir_foreach_variable(variable, &nir->inputs) {
-               unsigned semantic_name, semantic_index;
-
-               const struct glsl_type *type = variable->type;
-               if (nir_is_per_vertex_io(variable, nir->info.stage)) {
-                       assert(glsl_type_is_array(type));
-                       type = glsl_get_array_element(type);
-               }
-
-               unsigned attrib_count = glsl_count_attribute_slots(type,
-                                                                  nir->info.stage == MESA_SHADER_VERTEX);
-
-               i = variable->data.driver_location;
-
-               /* Vertex shader inputs don't have semantics. The state
-                * tracker has already mapped them to attributes via
-                * variable->data.driver_location.
-                */
-               if (nir->info.stage == MESA_SHADER_VERTEX)
-                       continue;
-
-               for (unsigned j = 0; j < attrib_count; j++, i++) {
-
-                       if (processed_inputs & ((uint64_t)1 << i))
-                               continue;
-
-                       processed_inputs |= ((uint64_t)1 << i);
-
-                       tgsi_get_gl_varying_semantic(variable->data.location + j, true,
-                                                    &semantic_name, &semantic_index);
-
-                       info->input_semantic_name[i] = semantic_name;
-                       info->input_semantic_index[i] = semantic_index;
-
-                       if (semantic_name == TGSI_SEMANTIC_PRIMID)
-                               info->uses_primid = true;
-
-                       if (semantic_name == TGSI_SEMANTIC_COLOR) {
-                               /* We only need this for color inputs. */
-                               if (variable->data.sample)
-                                       info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_SAMPLE;
-                               else if (variable->data.centroid)
-                                       info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTROID;
-                               else
-                                       info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTER;
-                       }
-
-                        enum glsl_base_type base_type =
-                                glsl_get_base_type(glsl_without_array(variable->type));
-
-                        switch (variable->data.interpolation) {
-                        case INTERP_MODE_NONE:
-                                if (glsl_base_type_is_integer(base_type)) {
-                                        info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT;
-                                        break;
-                                }
-
-                                if (semantic_name == TGSI_SEMANTIC_COLOR) {
-                                        info->input_interpolate[i] = TGSI_INTERPOLATE_COLOR;
-                                        break;
-                                }
-                                /* fall-through */
-
-                        case INTERP_MODE_SMOOTH:
-                                assert(!glsl_base_type_is_integer(base_type));
-
-                                info->input_interpolate[i] = TGSI_INTERPOLATE_PERSPECTIVE;
-                                break;
-
-                        case INTERP_MODE_NOPERSPECTIVE:
-                                assert(!glsl_base_type_is_integer(base_type));
-
-                                info->input_interpolate[i] = TGSI_INTERPOLATE_LINEAR;
-                                break;
-
-                        case INTERP_MODE_FLAT:
-                                info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT;
-                                break;
-                        }
-               }
-       }
-
-       nir_foreach_variable(variable, &nir->outputs) {
-               const struct glsl_type *type = variable->type;
-               if (nir_is_per_vertex_io(variable, nir->info.stage)) {
-                       assert(glsl_type_is_array(type));
-                       type = glsl_get_array_element(type);
-               }
-
-               ASSERTED unsigned attrib_count = glsl_count_attribute_slots(type, false);
-               scan_output_helper(variable, 0, type, info);
-
-               unsigned loc = variable->data.location;
-               if (nir->info.stage == MESA_SHADER_FRAGMENT &&
-                   loc == FRAG_RESULT_COLOR &&
-                   nir->info.outputs_written & (1ull << loc)) {
-                       assert(attrib_count == 1);
-                       info->properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] = true;
-               }
-       }
-
-       info->num_inputs = nir->num_inputs;
-       info->num_outputs = nir->num_outputs;
-
-       info->constbuf0_num_slots = nir->num_uniforms;
-       info->shader_buffers_declared = u_bit_consecutive(0, nir->info.num_ssbos);
-       info->const_buffers_declared = u_bit_consecutive(1, nir->info.num_ubos);
-       if (nir->num_uniforms > 0)
-               info->const_buffers_declared |= 1;
-       info->images_declared = u_bit_consecutive(0, nir->info.num_images);
-       info->msaa_images_declared = u_bit_consecutive(0, nir->info.last_msaa_image + 1);
-       info->samplers_declared = nir->info.textures_used;
-
-       info->num_written_clipdistance = nir->info.clip_distance_array_size;
-       info->num_written_culldistance = nir->info.cull_distance_array_size;
-       info->clipdist_writemask = u_bit_consecutive(0, info->num_written_clipdistance);
-       info->culldist_writemask = u_bit_consecutive(0, info->num_written_culldistance);
-
-       if (info->processor == PIPE_SHADER_FRAGMENT)
-               info->uses_kill = nir->info.fs.uses_discard;
-
-       if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
-               info->tessfactors_are_def_in_all_invocs =
-                       ac_are_tessfactors_def_in_all_invocs(nir);
-       }
-
-       func = (struct nir_function *)exec_list_get_head_const(&nir->functions);
-       nir_foreach_block(block, func->impl) {
-               nir_foreach_instr(instr, block)
-                       scan_instruction(nir, info, instr);
-       }
+   nir_function *func;
+   unsigned i;
+
+   info->processor = pipe_shader_type_from_mesa(nir->info.stage);
+
+   info->properties[TGSI_PROPERTY_NEXT_SHADER] = pipe_shader_type_from_mesa(nir->info.next_stage);
+
+   if (nir->info.stage == MESA_SHADER_VERTEX) {
+      info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] = nir->info.vs.window_space_position;
+      info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] = nir->info.vs.blit_sgprs_amd;
+   }
+
+   if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
+      info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT] = nir->info.tess.tcs_vertices_out;
+   }
+
+   if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
+      if (nir->info.tess.primitive_mode == GL_ISOLINES)
+         info->properties[TGSI_PROPERTY_TES_PRIM_MODE] = PIPE_PRIM_LINES;
+      else
+         info->properties[TGSI_PROPERTY_TES_PRIM_MODE] = nir->info.tess.primitive_mode;
+
+      STATIC_ASSERT((TESS_SPACING_EQUAL + 1) % 3 == PIPE_TESS_SPACING_EQUAL);
+      STATIC_ASSERT((TESS_SPACING_FRACTIONAL_ODD + 1) % 3 == PIPE_TESS_SPACING_FRACTIONAL_ODD);
+      STATIC_ASSERT((TESS_SPACING_FRACTIONAL_EVEN + 1) % 3 == PIPE_TESS_SPACING_FRACTIONAL_EVEN);
+
+      info->properties[TGSI_PROPERTY_TES_SPACING] = (nir->info.tess.spacing + 1) % 3;
+      info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW] = !nir->info.tess.ccw;
+      info->properties[TGSI_PROPERTY_TES_POINT_MODE] = nir->info.tess.point_mode;
+   }
+
+   if (nir->info.stage == MESA_SHADER_GEOMETRY) {
+      info->properties[TGSI_PROPERTY_GS_INPUT_PRIM] = nir->info.gs.input_primitive;
+      info->properties[TGSI_PROPERTY_GS_OUTPUT_PRIM] = nir->info.gs.output_primitive;
+      info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES] = nir->info.gs.vertices_out;
+      info->properties[TGSI_PROPERTY_GS_INVOCATIONS] = nir->info.gs.invocations;
+   }
+
+   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
+      info->properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] =
+         nir->info.fs.early_fragment_tests | nir->info.fs.post_depth_coverage;
+      info->properties[TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE] = nir->info.fs.post_depth_coverage;
+
+      if (nir->info.fs.pixel_center_integer) {
+         info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] = TGSI_FS_COORD_PIXEL_CENTER_INTEGER;
+      }
+
+      if (nir->info.fs.depth_layout != FRAG_DEPTH_LAYOUT_NONE) {
+         switch (nir->info.fs.depth_layout) {
+         case FRAG_DEPTH_LAYOUT_ANY:
+            info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_ANY;
+            break;
+         case FRAG_DEPTH_LAYOUT_GREATER:
+            info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_GREATER;
+            break;
+         case FRAG_DEPTH_LAYOUT_LESS:
+            info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_LESS;
+            break;
+         case FRAG_DEPTH_LAYOUT_UNCHANGED:
+            info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_UNCHANGED;
+            break;
+         default:
+            unreachable("Unknow depth layout");
+         }
+      }
+   }
+
+   if (gl_shader_stage_is_compute(nir->info.stage)) {
+      info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] = nir->info.cs.local_size[0];
+      info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] = nir->info.cs.local_size[1];
+      info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] = nir->info.cs.local_size[2];
+      info->properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD] =
+         nir->info.cs.user_data_components_amd;
+   }
+
+   i = 0;
+   uint64_t processed_inputs = 0;
+   nir_foreach_variable (variable, &nir->inputs) {
+      unsigned semantic_name, semantic_index;
+
+      const struct glsl_type *type = variable->type;
+      if (nir_is_per_vertex_io(variable, nir->info.stage)) {
+         assert(glsl_type_is_array(type));
+         type = glsl_get_array_element(type);
+      }
+
+      unsigned attrib_count =
+         glsl_count_attribute_slots(type, nir->info.stage == MESA_SHADER_VERTEX);
+
+      i = variable->data.driver_location;
+
+      /* Vertex shader inputs don't have semantics. The state
+       * tracker has already mapped them to attributes via
+       * variable->data.driver_location.
+       */
+      if (nir->info.stage == MESA_SHADER_VERTEX)
+         continue;
+
+      for (unsigned j = 0; j < attrib_count; j++, i++) {
+
+         if (processed_inputs & ((uint64_t)1 << i))
+            continue;
+
+         processed_inputs |= ((uint64_t)1 << i);
+
+         tgsi_get_gl_varying_semantic(variable->data.location + j, true, &semantic_name,
+                                      &semantic_index);
+
+         info->input_semantic_name[i] = semantic_name;
+         info->input_semantic_index[i] = semantic_index;
+
+         if (semantic_name == TGSI_SEMANTIC_PRIMID)
+            info->uses_primid = true;
+
+         if (semantic_name == TGSI_SEMANTIC_COLOR) {
+            /* We only need this for color inputs. */
+            if (variable->data.sample)
+               info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_SAMPLE;
+            else if (variable->data.centroid)
+               info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTROID;
+            else
+               info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTER;
+         }
+
+         enum glsl_base_type base_type = glsl_get_base_type(glsl_without_array(variable->type));
+
+         switch (variable->data.interpolation) {
+         case INTERP_MODE_NONE:
+            if (glsl_base_type_is_integer(base_type)) {
+               info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT;
+               break;
+            }
+
+            if (semantic_name == TGSI_SEMANTIC_COLOR) {
+               info->input_interpolate[i] = TGSI_INTERPOLATE_COLOR;
+               break;
+            }
+            /* fall-through */
+
+         case INTERP_MODE_SMOOTH:
+            assert(!glsl_base_type_is_integer(base_type));
+
+            info->input_interpolate[i] = TGSI_INTERPOLATE_PERSPECTIVE;
+            break;
+
+         case INTERP_MODE_NOPERSPECTIVE:
+            assert(!glsl_base_type_is_integer(base_type));
+
+            info->input_interpolate[i] = TGSI_INTERPOLATE_LINEAR;
+            break;
+
+         case INTERP_MODE_FLAT:
+            info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT;
+            break;
+         }
+      }
+   }
+
+   nir_foreach_variable (variable, &nir->outputs) {
+      const struct glsl_type *type = variable->type;
+      if (nir_is_per_vertex_io(variable, nir->info.stage)) {
+         assert(glsl_type_is_array(type));
+         type = glsl_get_array_element(type);
+      }
+
+      ASSERTED unsigned attrib_count = glsl_count_attribute_slots(type, false);
+      scan_output_helper(variable, 0, type, info);
+
+      unsigned loc = variable->data.location;
+      if (nir->info.stage == MESA_SHADER_FRAGMENT && loc == FRAG_RESULT_COLOR &&
+          nir->info.outputs_written & (1ull << loc)) {
+         assert(attrib_count == 1);
+         info->properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] = true;
+      }
+   }
+
+   info->num_inputs = nir->num_inputs;
+   info->num_outputs = nir->num_outputs;
+
+   info->constbuf0_num_slots = nir->num_uniforms;
+   info->shader_buffers_declared = u_bit_consecutive(0, nir->info.num_ssbos);
+   info->const_buffers_declared = u_bit_consecutive(1, nir->info.num_ubos);
+   if (nir->num_uniforms > 0)
+      info->const_buffers_declared |= 1;
+   info->images_declared = u_bit_consecutive(0, nir->info.num_images);
+   info->msaa_images_declared = u_bit_consecutive(0, nir->info.last_msaa_image + 1);
+   info->samplers_declared = nir->info.textures_used;
+
+   info->num_written_clipdistance = nir->info.clip_distance_array_size;
+   info->num_written_culldistance = nir->info.cull_distance_array_size;
+   info->clipdist_writemask = u_bit_consecutive(0, info->num_written_clipdistance);
+   info->culldist_writemask = u_bit_consecutive(0, info->num_written_culldistance);
+
+   if (info->processor == PIPE_SHADER_FRAGMENT)
+      info->uses_kill = nir->info.fs.uses_discard;
+
+   if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
+      info->tessfactors_are_def_in_all_invocs = ac_are_tessfactors_def_in_all_invocs(nir);
+   }
+
+   func = (struct nir_function *)exec_list_get_head_const(&nir->functions);
+   nir_foreach_block (block, func->impl) {
+      nir_foreach_instr (instr, block)
+         scan_instruction(nir, info, instr);
+   }
 }
 
-static void
-si_nir_opts(struct nir_shader *nir)
+static void si_nir_opts(struct nir_shader *nir)
 {
-       bool progress;
-
-       do {
-               progress = false;
-
-               NIR_PASS_V(nir, nir_lower_vars_to_ssa);
-
-               NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
-               NIR_PASS(progress, nir, nir_opt_dead_write_vars);
-
-               NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
-               NIR_PASS_V(nir, nir_lower_phis_to_scalar);
-
-               /* (Constant) copy propagation is needed for txf with offsets. */
-               NIR_PASS(progress, nir, nir_copy_prop);
-               NIR_PASS(progress, nir, nir_opt_remove_phis);
-               NIR_PASS(progress, nir, nir_opt_dce);
-               if (nir_opt_trivial_continues(nir)) {
-                       progress = true;
-                       NIR_PASS(progress, nir, nir_copy_prop);
-                       NIR_PASS(progress, nir, nir_opt_dce);
-               }
-               NIR_PASS(progress, nir, nir_opt_if, true);
-               NIR_PASS(progress, nir, nir_opt_dead_cf);
-               NIR_PASS(progress, nir, nir_opt_cse);
-               NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);
-
-               /* Needed for algebraic lowering */
-               NIR_PASS(progress, nir, nir_opt_algebraic);
-               NIR_PASS(progress, nir, nir_opt_constant_folding);
-
-               if (!nir->info.flrp_lowered) {
-                       unsigned lower_flrp =
-                               (nir->options->lower_flrp16 ? 16 : 0) |
-                               (nir->options->lower_flrp32 ? 32 : 0) |
-                               (nir->options->lower_flrp64 ? 64 : 0);
-                       assert(lower_flrp);
-                       bool lower_flrp_progress = false;
-
-                       NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp,
-                                lower_flrp,
-                                false /* always_precise */,
-                                nir->options->lower_ffma);
-                       if (lower_flrp_progress) {
-                               NIR_PASS(progress, nir,
-                                        nir_opt_constant_folding);
-                               progress = true;
-                       }
-
-                       /* Nothing should rematerialize any flrps, so we only
-                        * need to do this lowering once.
-                        */
-                       nir->info.flrp_lowered = true;
-               }
-
-               NIR_PASS(progress, nir, nir_opt_undef);
-               NIR_PASS(progress, nir, nir_opt_conditional_discard);
-               if (nir->options->max_unroll_iterations) {
-                       NIR_PASS(progress, nir, nir_opt_loop_unroll, 0);
-               }
-       } while (progress);
+   bool progress;
+
+   do {
+      progress = false;
+
+      NIR_PASS_V(nir, nir_lower_vars_to_ssa);
+
+      NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
+      NIR_PASS(progress, nir, nir_opt_dead_write_vars);
+
+      NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
+      NIR_PASS_V(nir, nir_lower_phis_to_scalar);
+
+      /* (Constant) copy propagation is needed for txf with offsets. */
+      NIR_PASS(progress, nir, nir_copy_prop);
+      NIR_PASS(progress, nir, nir_opt_remove_phis);
+      NIR_PASS(progress, nir, nir_opt_dce);
+      if (nir_opt_trivial_continues(nir)) {
+         progress = true;
+         NIR_PASS(progress, nir, nir_copy_prop);
+         NIR_PASS(progress, nir, nir_opt_dce);
+      }
+      NIR_PASS(progress, nir, nir_opt_if, true);
+      NIR_PASS(progress, nir, nir_opt_dead_cf);
+      NIR_PASS(progress, nir, nir_opt_cse);
+      NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);
+
+      /* Needed for algebraic lowering */
+      NIR_PASS(progress, nir, nir_opt_algebraic);
+      NIR_PASS(progress, nir, nir_opt_constant_folding);
+
+      if (!nir->info.flrp_lowered) {
+         unsigned lower_flrp = (nir->options->lower_flrp16 ? 16 : 0) |
+                               (nir->options->lower_flrp32 ? 32 : 0) |
+                               (nir->options->lower_flrp64 ? 64 : 0);
+         assert(lower_flrp);
+         bool lower_flrp_progress = false;
+
+         NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp, lower_flrp, false /* always_precise */,
+                  nir->options->lower_ffma);
+         if (lower_flrp_progress) {
+            NIR_PASS(progress, nir, nir_opt_constant_folding);
+            progress = true;
+         }
+
+         /* Nothing should rematerialize any flrps, so we only
+          * need to do this lowering once.
+          */
+         nir->info.flrp_lowered = true;
+      }
+
+      NIR_PASS(progress, nir, nir_opt_undef);
+      NIR_PASS(progress, nir, nir_opt_conditional_discard);
+      if (nir->options->max_unroll_iterations) {
+         NIR_PASS(progress, nir, nir_opt_loop_unroll, 0);
+      }
+   } while (progress);
 }
 
-static int
-type_size_vec4(const struct glsl_type *type, bool bindless)
+static int type_size_vec4(const struct glsl_type *type, bool bindless)
 {
-       return glsl_count_attribute_slots(type, false);
+   return glsl_count_attribute_slots(type, false);
 }
 
-static void
-si_nir_lower_color(nir_shader *nir)
+static void si_nir_lower_color(nir_shader *nir)
 {
-        nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);
-
-        nir_builder b;
-        nir_builder_init(&b, entrypoint);
-
-        nir_foreach_block(block, entrypoint) {
-                nir_foreach_instr_safe(instr, block) {
-                        if (instr->type != nir_instr_type_intrinsic)
-                                continue;
-
-                        nir_intrinsic_instr *intrin =
-                                nir_instr_as_intrinsic(instr);
-
-                        if (intrin->intrinsic != nir_intrinsic_load_deref)
-                                continue;
-
-                        nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
-                        if (deref->mode != nir_var_shader_in)
-                                continue;
-
-                        b.cursor = nir_before_instr(instr);
-                        nir_variable *var = nir_deref_instr_get_variable(deref);
-                        nir_ssa_def *def;
-
-                        if (var->data.location == VARYING_SLOT_COL0) {
-                                def = nir_load_color0(&b);
-                        } else if (var->data.location == VARYING_SLOT_COL1) {
-                                def = nir_load_color1(&b);
-                        } else {
-                                continue;
-                        }
-
-                        nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(def));
-                        nir_instr_remove(instr);
-                }
-        }
+   nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);
+
+   nir_builder b;
+   nir_builder_init(&b, entrypoint);
+
+   nir_foreach_block (block, entrypoint) {
+      nir_foreach_instr_safe (instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+         if (intrin->intrinsic != nir_intrinsic_load_deref)
+            continue;
+
+         nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+         if (deref->mode != nir_var_shader_in)
+            continue;
+
+         b.cursor = nir_before_instr(instr);
+         nir_variable *var = nir_deref_instr_get_variable(deref);
+         nir_ssa_def *def;
+
+         if (var->data.location == VARYING_SLOT_COL0) {
+            def = nir_load_color0(&b);
+         } else if (var->data.location == VARYING_SLOT_COL1) {
+            def = nir_load_color1(&b);
+         } else {
+            continue;
+         }
+
+         nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(def));
+         nir_instr_remove(instr);
+      }
+   }
 }
 
 static void si_nir_lower_ps_inputs(struct nir_shader *nir)
 {
-       if (nir->info.stage != MESA_SHADER_FRAGMENT)
-               return;
-
-       NIR_PASS_V(nir, nir_lower_io_to_temporaries,
-                  nir_shader_get_entrypoint(nir), false, true);
-
-       /* Since we're doing nir_lower_io_to_temporaries late, we need
-        * to lower all the copy_deref's introduced by
-        * lower_io_to_temporaries before calling nir_lower_io.
-        */
-       NIR_PASS_V(nir, nir_split_var_copies);
-       NIR_PASS_V(nir, nir_lower_var_copies);
-       NIR_PASS_V(nir, nir_lower_global_vars_to_local);
-
-       si_nir_lower_color(nir);
-       NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in, type_size_vec4, 0);
-
-       /* This pass needs actual constants */
-       NIR_PASS_V(nir, nir_opt_constant_folding);
-       NIR_PASS_V(nir, nir_io_add_const_offset_to_base,
-                  nir_var_shader_in);
+   if (nir->info.stage != MESA_SHADER_FRAGMENT)
+      return;
+
+   NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir), false, true);
+
+   /* Since we're doing nir_lower_io_to_temporaries late, we need
+    * to lower all the copy_deref's introduced by
+    * lower_io_to_temporaries before calling nir_lower_io.
+    */
+   NIR_PASS_V(nir, nir_split_var_copies);
+   NIR_PASS_V(nir, nir_lower_var_copies);
+   NIR_PASS_V(nir, nir_lower_global_vars_to_local);
+
+   si_nir_lower_color(nir);
+   NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in, type_size_vec4, 0);
+
+   /* This pass needs actual constants */
+   NIR_PASS_V(nir, nir_opt_constant_folding);
+   NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in);
 }
 
 void si_nir_adjust_driver_locations(struct nir_shader *nir)
 {
-       /* Adjust the driver location of inputs and outputs. The state tracker
-        * interprets them as slots, while the ac/nir backend interprets them
-        * as individual components.
-        */
-       if (nir->info.stage != MESA_SHADER_FRAGMENT) {
-               nir_foreach_variable(variable, &nir->inputs)
-                       variable->data.driver_location *= 4;
-       }
-
-       nir_foreach_variable(variable, &nir->outputs)
-               variable->data.driver_location *= 4;
+   /* Adjust the driver location of inputs and outputs. The state tracker
+    * interprets them as slots, while the ac/nir backend interprets them
+    * as individual components.
+    */
+   if (nir->info.stage != MESA_SHADER_FRAGMENT) {
+      nir_foreach_variable (variable, &nir->inputs)
+         variable->data.driver_location *= 4;
+   }
+
+   nir_foreach_variable (variable, &nir->outputs)
+      variable->data.driver_location *= 4;
 }
 
 /**
@@ -938,65 +889,64 @@ void si_nir_adjust_driver_locations(struct nir_shader *nir)
  */
 static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir)
 {
-       /* Perform lowerings (and optimizations) of code.
-        *
-        * Performance considerations aside, we must:
-        * - lower certain ALU operations
-        * - ensure constant offsets for texture instructions are folded
-        *   and copy-propagated
-        */
-
-       static const struct nir_lower_tex_options lower_tex_options = {
-               .lower_txp = ~0u,
-       };
-       NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);
-
-       const nir_lower_subgroups_options subgroups_options = {
-               .subgroup_size = 64,
-               .ballot_bit_size = 64,
-               .lower_to_scalar = true,
-               .lower_subgroup_masks = true,
-               .lower_vote_trivial = false,
-               .lower_vote_eq_to_ballot = true,
-       };
-       NIR_PASS_V(nir, nir_lower_subgroups, &subgroups_options);
-
-       /* Lower load constants to scalar and then clean up the mess */
-       NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
-       NIR_PASS_V(nir, nir_lower_var_copies);
-       NIR_PASS_V(nir, nir_lower_pack);
-       NIR_PASS_V(nir, nir_opt_access);
-       si_nir_opts(nir);
-
-       /* Lower large variables that are always constant with load_constant
-        * intrinsics, which get turned into PC-relative loads from a data
-        * section next to the shader.
-        *
-        * st/mesa calls finalize_nir twice, but we can't call this pass twice.
-        */
-       bool changed = false;
-       if (!nir->constant_data) {
-               NIR_PASS(changed, nir, nir_opt_large_constants,
-                        glsl_get_natural_size_align_bytes, 16);
-       }
-
-       changed |= ac_lower_indirect_derefs(nir, sscreen->info.chip_class);
-       if (changed)
-               si_nir_opts(nir);
-
-       NIR_PASS_V(nir, nir_lower_bool_to_int32);
-       NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp);
-
-       if (sscreen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL))
-               NIR_PASS_V(nir, nir_lower_discard_to_demote);
+   /* Perform lowerings (and optimizations) of code.
+    *
+    * Performance considerations aside, we must:
+    * - lower certain ALU operations
+    * - ensure constant offsets for texture instructions are folded
+    *   and copy-propagated
+    */
+
+   static const struct nir_lower_tex_options lower_tex_options = {
+      .lower_txp = ~0u,
+   };
+   NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);
+
+   const nir_lower_subgroups_options subgroups_options = {
+      .subgroup_size = 64,
+      .ballot_bit_size = 64,
+      .lower_to_scalar = true,
+      .lower_subgroup_masks = true,
+      .lower_vote_trivial = false,
+      .lower_vote_eq_to_ballot = true,
+   };
+   NIR_PASS_V(nir, nir_lower_subgroups, &subgroups_options);
+
+   /* Lower load constants to scalar and then clean up the mess */
+   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
+   NIR_PASS_V(nir, nir_lower_var_copies);
+   NIR_PASS_V(nir, nir_lower_pack);
+   NIR_PASS_V(nir, nir_opt_access);
+   si_nir_opts(nir);
+
+   /* Lower large variables that are always constant with load_constant
+    * intrinsics, which get turned into PC-relative loads from a data
+    * section next to the shader.
+    *
+    * st/mesa calls finalize_nir twice, but we can't call this pass twice.
+    */
+   bool changed = false;
+   if (!nir->constant_data) {
+      NIR_PASS(changed, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);
+   }
+
+   changed |= ac_lower_indirect_derefs(nir, sscreen->info.chip_class);
+   if (changed)
+      si_nir_opts(nir);
+
+   NIR_PASS_V(nir, nir_lower_bool_to_int32);
+   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp);
+
+   if (sscreen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL))
+      NIR_PASS_V(nir, nir_lower_discard_to_demote);
 }
 
 void si_finalize_nir(struct pipe_screen *screen, void *nirptr, bool optimize)
 {
-       struct si_screen *sscreen = (struct si_screen *)screen;
-       struct nir_shader *nir = (struct nir_shader *)nirptr;
+   struct si_screen *sscreen = (struct si_screen *)screen;
+   struct nir_shader *nir = (struct nir_shader *)nirptr;
 
-       nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
-       si_nir_lower_ps_inputs(nir);
-       si_lower_nir(sscreen, nir);
+   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
+   si_nir_lower_ps_inputs(nir);
+   si_lower_nir(sscreen, nir);
 }
index 30cca361ac4176b9ca5ddddf87d2dd6fed27be74..e5fd089b59fb2bfc575bdea892a101b67388e4f2 100644 (file)
 #include "tgsi/tgsi_text.h"
 #include "tgsi/tgsi_ureg.h"
 
-void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
-                       unsigned num_layers)
+void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, unsigned num_layers)
 {
-       unsigned vs_blit_property;
-       void **vs;
-
-       switch (type) {
-       case UTIL_BLITTER_ATTRIB_NONE:
-               vs = num_layers > 1 ? &sctx->vs_blit_pos_layered :
-                                     &sctx->vs_blit_pos;
-               vs_blit_property = SI_VS_BLIT_SGPRS_POS;
-               break;
-       case UTIL_BLITTER_ATTRIB_COLOR:
-               vs = num_layers > 1 ? &sctx->vs_blit_color_layered :
-                                     &sctx->vs_blit_color;
-               vs_blit_property = SI_VS_BLIT_SGPRS_POS_COLOR;
-               break;
-       case UTIL_BLITTER_ATTRIB_TEXCOORD_XY:
-       case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW:
-               assert(num_layers == 1);
-               vs = &sctx->vs_blit_texcoord;
-               vs_blit_property = SI_VS_BLIT_SGPRS_POS_TEXCOORD;
-               break;
-       default:
-               assert(0);
-               return NULL;
-       }
-       if (*vs)
-               return *vs;
-
-       struct ureg_program *ureg = ureg_create(PIPE_SHADER_VERTEX);
-       if (!ureg)
-               return NULL;
-
-       /* Tell the shader to load VS inputs from SGPRs: */
-       ureg_property(ureg, TGSI_PROPERTY_VS_BLIT_SGPRS_AMD, vs_blit_property);
-       ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, true);
-
-       /* This is just a pass-through shader with 1-3 MOV instructions. */
-       ureg_MOV(ureg,
-                ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0),
-                ureg_DECL_vs_input(ureg, 0));
-
-       if (type != UTIL_BLITTER_ATTRIB_NONE) {
-               ureg_MOV(ureg,
-                        ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 0),
-                        ureg_DECL_vs_input(ureg, 1));
-       }
-
-       if (num_layers > 1) {
-               struct ureg_src instance_id =
-                       ureg_DECL_system_value(ureg, TGSI_SEMANTIC_INSTANCEID, 0);
-               struct ureg_dst layer =
-                       ureg_DECL_output(ureg, TGSI_SEMANTIC_LAYER, 0);
-
-               ureg_MOV(ureg, ureg_writemask(layer, TGSI_WRITEMASK_X),
-                        ureg_scalar(instance_id, TGSI_SWIZZLE_X));
-       }
-       ureg_END(ureg);
-
-       *vs = ureg_create_shader_and_destroy(ureg, &sctx->b);
-       return *vs;
+   unsigned vs_blit_property;
+   void **vs;
+
+   switch (type) {
+   case UTIL_BLITTER_ATTRIB_NONE:
+      vs = num_layers > 1 ? &sctx->vs_blit_pos_layered : &sctx->vs_blit_pos;
+      vs_blit_property = SI_VS_BLIT_SGPRS_POS;
+      break;
+   case UTIL_BLITTER_ATTRIB_COLOR:
+      vs = num_layers > 1 ? &sctx->vs_blit_color_layered : &sctx->vs_blit_color;
+      vs_blit_property = SI_VS_BLIT_SGPRS_POS_COLOR;
+      break;
+   case UTIL_BLITTER_ATTRIB_TEXCOORD_XY:
+   case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW:
+      assert(num_layers == 1);
+      vs = &sctx->vs_blit_texcoord;
+      vs_blit_property = SI_VS_BLIT_SGPRS_POS_TEXCOORD;
+      break;
+   default:
+      assert(0);
+      return NULL;
+   }
+   if (*vs)
+      return *vs;
+
+   struct ureg_program *ureg = ureg_create(PIPE_SHADER_VERTEX);
+   if (!ureg)
+      return NULL;
+
+   /* Tell the shader to load VS inputs from SGPRs: */
+   ureg_property(ureg, TGSI_PROPERTY_VS_BLIT_SGPRS_AMD, vs_blit_property);
+   ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, true);
+
+   /* This is just a pass-through shader with 1-3 MOV instructions. */
+   ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0), ureg_DECL_vs_input(ureg, 0));
+
+   if (type != UTIL_BLITTER_ATTRIB_NONE) {
+      ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 0), ureg_DECL_vs_input(ureg, 1));
+   }
+
+   if (num_layers > 1) {
+      struct ureg_src instance_id = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_INSTANCEID, 0);
+      struct ureg_dst layer = ureg_DECL_output(ureg, TGSI_SEMANTIC_LAYER, 0);
+
+      ureg_MOV(ureg, ureg_writemask(layer, TGSI_WRITEMASK_X),
+               ureg_scalar(instance_id, TGSI_SWIZZLE_X));
+   }
+   ureg_END(ureg);
+
+   *vs = ureg_create_shader_and_destroy(ureg, &sctx->b);
+   return *vs;
 }
 
 /**
@@ -97,137 +88,128 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
  */
 void *si_create_fixed_func_tcs(struct si_context *sctx)
 {
-       struct ureg_src outer, inner;
-       struct ureg_dst tessouter, tessinner;
-       struct ureg_program *ureg = ureg_create(PIPE_SHADER_TESS_CTRL);
+   struct ureg_src outer, inner;
+   struct ureg_dst tessouter, tessinner;
+   struct ureg_program *ureg = ureg_create(PIPE_SHADER_TESS_CTRL);
 
-       if (!ureg)
-               return NULL;
+   if (!ureg)
+      return NULL;
 
-       outer = ureg_DECL_system_value(ureg,
-                                      TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL, 0);
-       inner = ureg_DECL_system_value(ureg,
-                                      TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL, 0);
+   outer = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL, 0);
+   inner = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL, 0);
 
-       tessouter = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSOUTER, 0);
-       tessinner = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSINNER, 0);
+   tessouter = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSOUTER, 0);
+   tessinner = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSINNER, 0);
 
-       ureg_MOV(ureg, tessouter, outer);
-       ureg_MOV(ureg, tessinner, inner);
-       ureg_END(ureg);
+   ureg_MOV(ureg, tessouter, outer);
+   ureg_MOV(ureg, tessinner, inner);
+   ureg_END(ureg);
 
-       return ureg_create_shader_and_destroy(ureg, &sctx->b);
+   return ureg_create_shader_and_destroy(ureg, &sctx->b);
 }
 
 /* Create a compute shader implementing clear_buffer or copy_buffer. */
-void *si_create_dma_compute_shader(struct pipe_context *ctx,
-                                  unsigned num_dwords_per_thread,
-                                  bool dst_stream_cache_policy, bool is_copy)
+void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread,
+                                   bool dst_stream_cache_policy, bool is_copy)
 {
-       struct si_screen *sscreen = (struct si_screen *)ctx->screen;
-       assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
-
-       unsigned store_qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT;
-       if (dst_stream_cache_policy)
-               store_qualifier |= TGSI_MEMORY_STREAM_CACHE_POLICY;
-
-       /* Don't cache loads, because there is no reuse. */
-       unsigned load_qualifier = store_qualifier | TGSI_MEMORY_STREAM_CACHE_POLICY;
-
-       unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
-       unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));
-
-       for (unsigned i = 0; i < num_mem_ops; i++) {
-               if (i*4 < num_dwords_per_thread)
-                       inst_dwords[i] = MIN2(4, num_dwords_per_thread - i*4);
-       }
-
-       struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
-       if (!ureg)
-               return NULL;
-
-       ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, sscreen->compute_wave_size);
-       ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1);
-       ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
-
-       struct ureg_src value;
-       if (!is_copy) {
-               ureg_property(ureg, TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD, inst_dwords[0]);
-               value = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_CS_USER_DATA_AMD, 0);
-       }
-
-       struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
-       struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
-       struct ureg_dst store_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
-       struct ureg_dst load_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
-       struct ureg_dst dstbuf = ureg_dst(ureg_DECL_buffer(ureg, 0, false));
-       struct ureg_src srcbuf;
-       struct ureg_src *values = NULL;
-
-       if (is_copy) {
-               srcbuf = ureg_DECL_buffer(ureg, 1, false);
-               values = malloc(num_mem_ops * sizeof(struct ureg_src));
-       }
-
-       /* If there are multiple stores, the first store writes into 0*wavesize+tid,
-        * the 2nd store writes into 1*wavesize+tid, the 3rd store writes into 2*wavesize+tid, etc.
-        */
-       ureg_UMAD(ureg, store_addr, blk,
-                 ureg_imm1u(ureg, sscreen->compute_wave_size * num_mem_ops), tid);
-       /* Convert from a "store size unit" into bytes. */
-       ureg_UMUL(ureg, store_addr, ureg_src(store_addr),
-                 ureg_imm1u(ureg, 4 * inst_dwords[0]));
-       ureg_MOV(ureg, load_addr, ureg_src(store_addr));
-
-       /* Distance between a load and a store for latency hiding. */
-       unsigned load_store_distance = is_copy ? 8 : 0;
-
-       for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
-               int d = i - load_store_distance;
-
-               if (is_copy && i < num_mem_ops) {
-                       if (i) {
-                               ureg_UADD(ureg, load_addr, ureg_src(load_addr),
-                                         ureg_imm1u(ureg, 4 * inst_dwords[i] *
-                                                    sscreen->compute_wave_size));
-                       }
-
-                       values[i] = ureg_src(ureg_DECL_temporary(ureg));
-                       struct ureg_dst dst =
-                               ureg_writemask(ureg_dst(values[i]),
-                                              u_bit_consecutive(0, inst_dwords[i]));
-                       struct ureg_src srcs[] = {srcbuf, ureg_src(load_addr)};
-                       ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dst, 1, srcs, 2,
-                                        load_qualifier, TGSI_TEXTURE_BUFFER, 0);
-               }
-
-               if (d >= 0) {
-                       if (d) {
-                               ureg_UADD(ureg, store_addr, ureg_src(store_addr),
-                                         ureg_imm1u(ureg, 4 * inst_dwords[d] *
-                                                    sscreen->compute_wave_size));
-                       }
-
-                       struct ureg_dst dst =
-                               ureg_writemask(dstbuf, u_bit_consecutive(0, inst_dwords[d]));
-                       struct ureg_src srcs[] =
-                               {ureg_src(store_addr), is_copy ? values[d] : value};
-                       ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst, 1, srcs, 2,
-                                        store_qualifier, TGSI_TEXTURE_BUFFER, 0);
-               }
-       }
-       ureg_END(ureg);
-
-       struct pipe_compute_state state = {};
-       state.ir_type = PIPE_SHADER_IR_TGSI;
-       state.prog = ureg_get_tokens(ureg, NULL);
-
-       void *cs = ctx->create_compute_state(ctx, &state);
-       ureg_destroy(ureg);
-        ureg_free_tokens(state.prog);
-
-       free(values);
-       return cs;
+   struct si_screen *sscreen = (struct si_screen *)ctx->screen;
+   assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
+
+   unsigned store_qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT;
+   if (dst_stream_cache_policy)
+      store_qualifier |= TGSI_MEMORY_STREAM_CACHE_POLICY;
+
+   /* Don't cache loads, because there is no reuse. */
+   unsigned load_qualifier = store_qualifier | TGSI_MEMORY_STREAM_CACHE_POLICY;
+
+   unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
+   unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));
+
+   for (unsigned i = 0; i < num_mem_ops; i++) {
+      if (i * 4 < num_dwords_per_thread)
+         inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4);
+   }
+
+   struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
+   if (!ureg)
+      return NULL;
+
+   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, sscreen->compute_wave_size);
+   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1);
+   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
+
+   struct ureg_src value;
+   if (!is_copy) {
+      ureg_property(ureg, TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD, inst_dwords[0]);
+      value = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_CS_USER_DATA_AMD, 0);
+   }
+
+   struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
+   struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
+   struct ureg_dst store_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
+   struct ureg_dst load_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
+   struct ureg_dst dstbuf = ureg_dst(ureg_DECL_buffer(ureg, 0, false));
+   struct ureg_src srcbuf;
+   struct ureg_src *values = NULL;
+
+   if (is_copy) {
+      srcbuf = ureg_DECL_buffer(ureg, 1, false);
+      values = malloc(num_mem_ops * sizeof(struct ureg_src));
+   }
+
+   /* If there are multiple stores, the first store writes into 0*wavesize+tid,
+    * the 2nd store writes into 1*wavesize+tid, the 3rd store writes into 2*wavesize+tid, etc.
+    */
+   ureg_UMAD(ureg, store_addr, blk, ureg_imm1u(ureg, sscreen->compute_wave_size * num_mem_ops),
+             tid);
+   /* Convert from a "store size unit" into bytes. */
+   ureg_UMUL(ureg, store_addr, ureg_src(store_addr), ureg_imm1u(ureg, 4 * inst_dwords[0]));
+   ureg_MOV(ureg, load_addr, ureg_src(store_addr));
+
+   /* Distance between a load and a store for latency hiding. */
+   unsigned load_store_distance = is_copy ? 8 : 0;
+
+   for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
+      int d = i - load_store_distance;
+
+      if (is_copy && i < num_mem_ops) {
+         if (i) {
+            ureg_UADD(ureg, load_addr, ureg_src(load_addr),
+                      ureg_imm1u(ureg, 4 * inst_dwords[i] * sscreen->compute_wave_size));
+         }
+
+         values[i] = ureg_src(ureg_DECL_temporary(ureg));
+         struct ureg_dst dst =
+            ureg_writemask(ureg_dst(values[i]), u_bit_consecutive(0, inst_dwords[i]));
+         struct ureg_src srcs[] = {srcbuf, ureg_src(load_addr)};
+         ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dst, 1, srcs, 2, load_qualifier,
+                          TGSI_TEXTURE_BUFFER, 0);
+      }
+
+      if (d >= 0) {
+         if (d) {
+            ureg_UADD(ureg, store_addr, ureg_src(store_addr),
+                      ureg_imm1u(ureg, 4 * inst_dwords[d] * sscreen->compute_wave_size));
+         }
+
+         struct ureg_dst dst = ureg_writemask(dstbuf, u_bit_consecutive(0, inst_dwords[d]));
+         struct ureg_src srcs[] = {ureg_src(store_addr), is_copy ? values[d] : value};
+         ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst, 1, srcs, 2, store_qualifier,
+                          TGSI_TEXTURE_BUFFER, 0);
+      }
+   }
+   ureg_END(ureg);
+
+   struct pipe_compute_state state = {};
+   state.ir_type = PIPE_SHADER_IR_TGSI;
+   state.prog = ureg_get_tokens(ureg, NULL);
+
+   void *cs = ctx->create_compute_state(ctx, &state);
+   ureg_destroy(ureg);
+   ureg_free_tokens(state.prog);
+
+   free(values);
+   return cs;
 }
 
 /* Create a compute shader that copies DCC from one buffer to another
@@ -240,67 +222,63 @@ void *si_create_dma_compute_shader(struct pipe_context *ctx,
  */
 void *si_create_dcc_retile_cs(struct pipe_context *ctx)
 {
-       struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
-       if (!ureg)
-               return NULL;
-
-       ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 64);
-       ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1);
-       ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
-
-       /* Compute the global thread ID (in idx). */
-       struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
-       struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
-       struct ureg_dst idx = ureg_writemask(ureg_DECL_temporary(ureg),
-                                            TGSI_WRITEMASK_X);
-       ureg_UMAD(ureg, idx, blk, ureg_imm1u(ureg, 64), tid);
-
-       /* Load 2 pairs of offsets for DCC load & store. */
-       struct ureg_src map = ureg_DECL_image(ureg, 0, TGSI_TEXTURE_BUFFER, 0, false, false);
-       struct ureg_dst offsets = ureg_DECL_temporary(ureg);
-       struct ureg_src map_load_args[] = {map, ureg_src(idx)};
-
-       ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &offsets, 1, map_load_args, 2,
-                        TGSI_MEMORY_RESTRICT, TGSI_TEXTURE_BUFFER, 0);
-
-       struct ureg_src dcc_src = ureg_DECL_image(ureg, 1, TGSI_TEXTURE_BUFFER,
-                                                 0, false, false);
-       struct ureg_dst dcc_dst = ureg_dst(ureg_DECL_image(ureg, 2, TGSI_TEXTURE_BUFFER,
-                                                          0, true, false));
-       struct ureg_dst dcc_value[2];
-
-       /* Copy DCC values:
-        *   dst[offsets.y] = src[offsets.x];
-        *   dst[offsets.w] = src[offsets.z];
-        */
-       for (unsigned i = 0; i < 2; i++) {
-               dcc_value[i] = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
-
-               struct ureg_src load_args[] =
-                       {dcc_src, ureg_scalar(ureg_src(offsets), TGSI_SWIZZLE_X + i*2)};
-               ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dcc_value[i], 1, load_args, 2,
-                                TGSI_MEMORY_RESTRICT, TGSI_TEXTURE_BUFFER, 0);
-       }
-
-       dcc_dst = ureg_writemask(dcc_dst, TGSI_WRITEMASK_X);
-
-       for (unsigned i = 0; i < 2; i++) {
-               struct ureg_src store_args[] = {
-                       ureg_scalar(ureg_src(offsets), TGSI_SWIZZLE_Y + i*2),
-                       ureg_src(dcc_value[i])
-               };
-               ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dcc_dst, 1, store_args, 2,
-                                TGSI_MEMORY_RESTRICT, TGSI_TEXTURE_BUFFER, 0);
-       }
-       ureg_END(ureg);
-
-       struct pipe_compute_state state = {};
-       state.ir_type = PIPE_SHADER_IR_TGSI;
-       state.prog = ureg_get_tokens(ureg, NULL);
-
-       void *cs = ctx->create_compute_state(ctx, &state);
-       ureg_destroy(ureg);
-       return cs;
+   struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
+   if (!ureg)
+      return NULL;
+
+   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 64);
+   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1);
+   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
+
+   /* Compute the global thread ID (in idx). */
+   struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
+   struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
+   struct ureg_dst idx = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
+   ureg_UMAD(ureg, idx, blk, ureg_imm1u(ureg, 64), tid);
+
+   /* Load 2 pairs of offsets for DCC load & store. */
+   struct ureg_src map = ureg_DECL_image(ureg, 0, TGSI_TEXTURE_BUFFER, 0, false, false);
+   struct ureg_dst offsets = ureg_DECL_temporary(ureg);
+   struct ureg_src map_load_args[] = {map, ureg_src(idx)};
+
+   ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &offsets, 1, map_load_args, 2, TGSI_MEMORY_RESTRICT,
+                    TGSI_TEXTURE_BUFFER, 0);
+
+   struct ureg_src dcc_src = ureg_DECL_image(ureg, 1, TGSI_TEXTURE_BUFFER, 0, false, false);
+   struct ureg_dst dcc_dst =
+      ureg_dst(ureg_DECL_image(ureg, 2, TGSI_TEXTURE_BUFFER, 0, true, false));
+   struct ureg_dst dcc_value[2];
+
+   /* Copy DCC values:
+    *   dst[offsets.y] = src[offsets.x];
+    *   dst[offsets.w] = src[offsets.z];
+    */
+   for (unsigned i = 0; i < 2; i++) {
+      dcc_value[i] = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
+
+      struct ureg_src load_args[] = {dcc_src,
+                                     ureg_scalar(ureg_src(offsets), TGSI_SWIZZLE_X + i * 2)};
+      ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dcc_value[i], 1, load_args, 2, TGSI_MEMORY_RESTRICT,
+                       TGSI_TEXTURE_BUFFER, 0);
+   }
+
+   dcc_dst = ureg_writemask(dcc_dst, TGSI_WRITEMASK_X);
+
+   for (unsigned i = 0; i < 2; i++) {
+      struct ureg_src store_args[] = {ureg_scalar(ureg_src(offsets), TGSI_SWIZZLE_Y + i * 2),
+                                      ureg_src(dcc_value[i])};
+      ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dcc_dst, 1, store_args, 2, TGSI_MEMORY_RESTRICT,
+                       TGSI_TEXTURE_BUFFER, 0);
+   }
+   ureg_END(ureg);
+
+   struct pipe_compute_state state = {};
+   state.ir_type = PIPE_SHADER_IR_TGSI;
+   state.prog = ureg_get_tokens(ureg, NULL);
+
+   void *cs = ctx->create_compute_state(ctx, &state);
+   ureg_destroy(ureg);
+   return cs;
 }
 
 /* Create the compute shader that is used to collect the results.
@@ -337,186 +315,185 @@ void *si_create_dcc_retile_cs(struct pipe_context *ctx)
  */
 void *si_create_query_result_cs(struct si_context *sctx)
 {
-       /* TEMP[0].xy = accumulated result so far
-        * TEMP[0].z = result not available
-        *
-        * TEMP[1].x = current result index
-        * TEMP[1].y = current pair index
-        */
-       static const char text_tmpl[] =
-               "COMP\n"
-               "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
-               "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
-               "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
-               "DCL BUFFER[0]\n"
-               "DCL BUFFER[1]\n"
-               "DCL BUFFER[2]\n"
-               "DCL CONST[0][0..1]\n"
-               "DCL TEMP[0..5]\n"
-               "IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"
-               "IMM[1] UINT32 {1, 2, 4, 8}\n"
-               "IMM[2] UINT32 {16, 32, 64, 128}\n"
-               "IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */
-               "IMM[4] UINT32 {256, 0, 0, 0}\n"
-
-               "AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n"
-               "UIF TEMP[5]\n"
-                       /* Check result availability. */
-                       "LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n"
-                       "ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"
-                       "MOV TEMP[1], TEMP[0].zzzz\n"
-                       "NOT TEMP[0].z, TEMP[0].zzzz\n"
-
-                       /* Load result if available. */
-                       "UIF TEMP[1]\n"
-                               "LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n"
-                       "ENDIF\n"
-               "ELSE\n"
-                       /* Load previously accumulated result if requested. */
-                       "MOV TEMP[0], IMM[0].xxxx\n"
-                       "AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n"
-                       "UIF TEMP[4]\n"
-                               "LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n"
-                       "ENDIF\n"
-
-                       "MOV TEMP[1].x, IMM[0].xxxx\n"
-                       "BGNLOOP\n"
-                               /* Break if accumulated result so far is not available. */
-                               "UIF TEMP[0].zzzz\n"
-                                       "BRK\n"
-                               "ENDIF\n"
-
-                               /* Break if result_index >= result_count. */
-                               "USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n"
-                               "UIF TEMP[5]\n"
-                                       "BRK\n"
-                               "ENDIF\n"
-
-                               /* Load fence and check result availability */
-                               "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n"
-                               "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
-                               "ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"
-                               "NOT TEMP[0].z, TEMP[0].zzzz\n"
-                               "UIF TEMP[0].zzzz\n"
-                                       "BRK\n"
-                               "ENDIF\n"
-
-                               "MOV TEMP[1].y, IMM[0].xxxx\n"
-                               "BGNLOOP\n"
-                                       /* Load start and end. */
-                                       "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n"
-                                       "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n"
-                                       "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
-
-                                       "UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n"
-                                       "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"
-
-                                       "U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n"
-
-                                       "AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n"
-                                       "UIF TEMP[5].zzzz\n"
-                                               /* Load second start/end half-pair and
-                                                * take the difference
-                                                */
-                                               "UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n"
-                                               "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
-                                               "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"
-
-                                               "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"
-                                               "U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n"
-                                       "ENDIF\n"
-
-                                       "U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n"
-
-                                       /* Increment pair index */
-                                       "UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n"
-                                       "USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n"
-                                       "UIF TEMP[5]\n"
-                                               "BRK\n"
-                                       "ENDIF\n"
-                               "ENDLOOP\n"
-
-                               /* Increment result index */
-                               "UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n"
-                       "ENDLOOP\n"
-               "ENDIF\n"
-
-               "AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n"
-               "UIF TEMP[4]\n"
-                       /* Store accumulated data for chaining. */
-                       "STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n"
-               "ELSE\n"
-                       "AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n"
-                       "UIF TEMP[4]\n"
-                               /* Store result availability. */
-                               "NOT TEMP[0].z, TEMP[0]\n"
-                               "AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"
-                               "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n"
-
-                               "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
-                               "UIF TEMP[4]\n"
-                                       "STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n"
-                               "ENDIF\n"
-                       "ELSE\n"
-                               /* Store result if it is available. */
-                               "NOT TEMP[4], TEMP[0].zzzz\n"
-                               "UIF TEMP[4]\n"
-                                       /* Apply timestamp conversion */
-                                       "AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n"
-                                       "UIF TEMP[4]\n"
-                                               "U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n"
-                                               "U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n"
-                                       "ENDIF\n"
-
-                                       /* Convert to boolean */
-                                       "AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n"
-                                       "UIF TEMP[4]\n"
-                                               "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n"
-                                               "AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"
-                                               "MOV TEMP[0].y, IMM[0].xxxx\n"
-                                       "ENDIF\n"
-
-                                       "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
-                                       "UIF TEMP[4]\n"
-                                               "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n"
-                                       "ELSE\n"
-                                               /* Clamping */
-                                               "UIF TEMP[0].yyyy\n"
-                                                       "MOV TEMP[0].x, IMM[0].wwww\n"
-                                               "ENDIF\n"
-
-                                               "AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n"
-                                               "UIF TEMP[4]\n"
-                                                       "UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"
-                                               "ENDIF\n"
-
-                                               "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
-                                       "ENDIF\n"
-                               "ENDIF\n"
-                       "ENDIF\n"
-               "ENDIF\n"
-
-               "END\n";
-
-       char text[sizeof(text_tmpl) + 32];
-       struct tgsi_token tokens[1024];
-       struct pipe_compute_state state = {};
-
-       /* Hard code the frequency into the shader so that the backend can
-        * use the full range of optimizations for divide-by-constant.
-        */
-       snprintf(text, sizeof(text), text_tmpl,
-                sctx->screen->info.clock_crystal_freq);
-
-       if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
-               assert(false);
-               return NULL;
-       }
-
-       state.ir_type = PIPE_SHADER_IR_TGSI;
-       state.prog = tokens;
-
-       return sctx->b.create_compute_state(&sctx->b, &state);
+   /* TEMP[0].xy = accumulated result so far
+    * TEMP[0].z = result not available
+    *
+    * TEMP[1].x = current result index
+    * TEMP[1].y = current pair index
+    */
+   static const char text_tmpl[] =
+      "COMP\n"
+      "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
+      "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+      "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+      "DCL BUFFER[0]\n"
+      "DCL BUFFER[1]\n"
+      "DCL BUFFER[2]\n"
+      "DCL CONST[0][0..1]\n"
+      "DCL TEMP[0..5]\n"
+      "IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"
+      "IMM[1] UINT32 {1, 2, 4, 8}\n"
+      "IMM[2] UINT32 {16, 32, 64, 128}\n"
+      "IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */
+      "IMM[4] UINT32 {256, 0, 0, 0}\n"
+
+      "AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n"
+      "UIF TEMP[5]\n"
+      /* Check result availability. */
+      "LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n"
+      "ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"
+      "MOV TEMP[1], TEMP[0].zzzz\n"
+      "NOT TEMP[0].z, TEMP[0].zzzz\n"
+
+      /* Load result if available. */
+      "UIF TEMP[1]\n"
+      "LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n"
+      "ENDIF\n"
+      "ELSE\n"
+      /* Load previously accumulated result if requested. */
+      "MOV TEMP[0], IMM[0].xxxx\n"
+      "AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n"
+      "UIF TEMP[4]\n"
+      "LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n"
+      "ENDIF\n"
+
+      "MOV TEMP[1].x, IMM[0].xxxx\n"
+      "BGNLOOP\n"
+      /* Break if accumulated result so far is not available. */
+      "UIF TEMP[0].zzzz\n"
+      "BRK\n"
+      "ENDIF\n"
+
+      /* Break if result_index >= result_count. */
+      "USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n"
+      "UIF TEMP[5]\n"
+      "BRK\n"
+      "ENDIF\n"
+
+      /* Load fence and check result availability */
+      "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n"
+      "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
+      "ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"
+      "NOT TEMP[0].z, TEMP[0].zzzz\n"
+      "UIF TEMP[0].zzzz\n"
+      "BRK\n"
+      "ENDIF\n"
+
+      "MOV TEMP[1].y, IMM[0].xxxx\n"
+      "BGNLOOP\n"
+      /* Load start and end. */
+      "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n"
+      "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n"
+      "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
+
+      "UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n"
+      "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"
+
+      "U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n"
+
+      "AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n"
+      "UIF TEMP[5].zzzz\n"
+      /* Load second start/end half-pair and
+       * take the difference
+       */
+      "UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n"
+      "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
+      "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"
+
+      "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"
+      "U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n"
+      "ENDIF\n"
+
+      "U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n"
+
+      /* Increment pair index */
+      "UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n"
+      "USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n"
+      "UIF TEMP[5]\n"
+      "BRK\n"
+      "ENDIF\n"
+      "ENDLOOP\n"
+
+      /* Increment result index */
+      "UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n"
+      "ENDLOOP\n"
+      "ENDIF\n"
+
+      "AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n"
+      "UIF TEMP[4]\n"
+      /* Store accumulated data for chaining. */
+      "STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n"
+      "ELSE\n"
+      "AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n"
+      "UIF TEMP[4]\n"
+      /* Store result availability. */
+      "NOT TEMP[0].z, TEMP[0]\n"
+      "AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"
+      "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n"
+
+      "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
+      "UIF TEMP[4]\n"
+      "STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n"
+      "ENDIF\n"
+      "ELSE\n"
+      /* Store result if it is available. */
+      "NOT TEMP[4], TEMP[0].zzzz\n"
+      "UIF TEMP[4]\n"
+      /* Apply timestamp conversion */
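+      /* 64-bit math: result = ticks * 1000000 / clock_crystal_freq (IMM[3].xy, IMM[3].zw) */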
+      "AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n"
+      "UIF TEMP[4]\n"
+      "U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n"
+      "U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n"
+      "ENDIF\n"
+
+      /* Convert to boolean */
+      "AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n"
+      "UIF TEMP[4]\n"
+      "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n"
+      "AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"
+      "MOV TEMP[0].y, IMM[0].xxxx\n"
+      "ENDIF\n"
+
+      "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
+      "UIF TEMP[4]\n"
+      "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n"
+      "ELSE\n"
+      /* Clamping */
+      "UIF TEMP[0].yyyy\n"
+      "MOV TEMP[0].x, IMM[0].wwww\n"
+      "ENDIF\n"
+
+      "AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n"
+      "UIF TEMP[4]\n"
+      "UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"
+      "ENDIF\n"
+
+      "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
+      "ENDIF\n"
+      "ENDIF\n"
+      "ENDIF\n"
+      "ENDIF\n"
+
+      "END\n";
+
+   char text[sizeof(text_tmpl) + 32];
+   struct tgsi_token tokens[1024];
+   struct pipe_compute_state state = {};
+
+   /* Hard code the frequency into the shader so that the backend can
+    * use the full range of optimizations for divide-by-constant.
+    */
+   snprintf(text, sizeof(text), text_tmpl, sctx->screen->info.clock_crystal_freq);
+
+   if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
+      assert(false);
+      return NULL;
+   }
+
+   state.ir_type = PIPE_SHADER_IR_TGSI;
+   state.prog = tokens;
+
+   return sctx->b.create_compute_state(&sctx->b, &state);
 }
 
 /* Create a compute shader implementing copy_image.
@@ -524,247 +501,238 @@ void *si_create_query_result_cs(struct si_context *sctx)
  */
 void *si_create_copy_image_compute_shader(struct pipe_context *ctx)
 {
-       static const char text[] =
-               "COMP\n"
-               "PROPERTY CS_FIXED_BLOCK_WIDTH 8\n"
-               "PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n"
-               "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
-               "DCL SV[0], THREAD_ID\n"
-               "DCL SV[1], BLOCK_ID\n"
-               "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
-               "DCL IMAGE[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
-               "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
-               "DCL TEMP[0..4], LOCAL\n"
-               "IMM[0] UINT32 {8, 1, 0, 0}\n"
-               "MOV TEMP[0].xyz, CONST[0][0].xyzw\n"
-               "UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n"
-               "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n"
-               "LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
-               "MOV TEMP[4].xyz, CONST[0][1].xyzw\n"
-               "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[4].xyzx\n"
-               "STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
-               "END\n";
-
-       struct tgsi_token tokens[1024];
-       struct pipe_compute_state state = {0};
-
-       if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
-               assert(false);
-               return NULL;
-       }
-
-       state.ir_type = PIPE_SHADER_IR_TGSI;
-       state.prog = tokens;
-
-       return ctx->create_compute_state(ctx, &state);
+   static const char text[] =
+      "COMP\n"
+      "PROPERTY CS_FIXED_BLOCK_WIDTH 8\n"
+      "PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n"
+      "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+      "DCL SV[0], THREAD_ID\n"
+      "DCL SV[1], BLOCK_ID\n"
+      "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
+      "DCL IMAGE[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
+      "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
+      "DCL TEMP[0..4], LOCAL\n"
+      "IMM[0] UINT32 {8, 1, 0, 0}\n"
+      "MOV TEMP[0].xyz, CONST[0][0].xyzw\n"
+      "UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n"
+      "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n"
+      "LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
+      "MOV TEMP[4].xyz, CONST[0][1].xyzw\n"
+      "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[4].xyzx\n"
+      "STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
+      "END\n";
+
+   struct tgsi_token tokens[1024];
+   struct pipe_compute_state state = {0};
+
+   if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
+      assert(false);
+      return NULL;
+   }
+
+   state.ir_type = PIPE_SHADER_IR_TGSI;
+   state.prog = tokens;
+
+   return ctx->create_compute_state(ctx, &state);
 }
 
 void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx)
 {
-       static const char text[] =
-               "COMP\n"
-               "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
-               "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
-               "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
-               "DCL SV[0], THREAD_ID\n"
-               "DCL SV[1], BLOCK_ID\n"
-               "DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
-               "DCL IMAGE[1], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
-               "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
-               "DCL TEMP[0..4], LOCAL\n"
-               "IMM[0] UINT32 {64, 1, 0, 0}\n"
-               "MOV TEMP[0].xy, CONST[0][0].xzzw\n"
-               "UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"
-               "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n"
-               "LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
-               "MOV TEMP[4].xy, CONST[0][1].xzzw\n"
-               "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[4].xyzx\n"
-               "STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
-               "END\n";
-
-       struct tgsi_token tokens[1024];
-       struct pipe_compute_state state = {0};
-
-       if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
-               assert(false);
-               return NULL;
-       }
-
-       state.ir_type = PIPE_SHADER_IR_TGSI;
-       state.prog = tokens;
-
-       return ctx->create_compute_state(ctx, &state);
+   static const char text[] =
+      "COMP\n"
+      "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
+      "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+      "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+      "DCL SV[0], THREAD_ID\n"
+      "DCL SV[1], BLOCK_ID\n"
+      "DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
+      "DCL IMAGE[1], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
+      "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
+      "DCL TEMP[0..4], LOCAL\n"
+      "IMM[0] UINT32 {64, 1, 0, 0}\n"
+      "MOV TEMP[0].xy, CONST[0][0].xzzw\n"
+      "UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"
+      "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n"
+      "LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
+      "MOV TEMP[4].xy, CONST[0][1].xzzw\n"
+      "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[4].xyzx\n"
+      "STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
+      "END\n";
+
+   struct tgsi_token tokens[1024];
+   struct pipe_compute_state state = {0};
+
+   if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
+      assert(false);
+      return NULL;
+   }
+
+   state.ir_type = PIPE_SHADER_IR_TGSI;
+   state.prog = tokens;
+
+   return ctx->create_compute_state(ctx, &state);
 }
 
 void *si_clear_render_target_shader(struct pipe_context *ctx)
 {
-       static const char text[] =
-               "COMP\n"
-               "PROPERTY CS_FIXED_BLOCK_WIDTH 8\n"
-               "PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n"
-               "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
-               "DCL SV[0], THREAD_ID\n"
-               "DCL SV[1], BLOCK_ID\n"
-               "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
-               "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
-               "DCL TEMP[0..3], LOCAL\n"
-               "IMM[0] UINT32 {8, 1, 0, 0}\n"
-               "MOV TEMP[0].xyz, CONST[0][0].xyzw\n"
-               "UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n"
-               "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n"
-               "MOV TEMP[3].xyzw, CONST[0][1].xyzw\n"
-               "STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
-               "END\n";
-
-       struct tgsi_token tokens[1024];
-       struct pipe_compute_state state = {0};
-
-       if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
-               assert(false);
-               return NULL;
-       }
-
-       state.ir_type = PIPE_SHADER_IR_TGSI;
-       state.prog = tokens;
-
-       return ctx->create_compute_state(ctx, &state);
+   static const char text[] =
+      "COMP\n"
+      "PROPERTY CS_FIXED_BLOCK_WIDTH 8\n"
+      "PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n"
+      "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+      "DCL SV[0], THREAD_ID\n"
+      "DCL SV[1], BLOCK_ID\n"
+      "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
+      "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
+      "DCL TEMP[0..3], LOCAL\n"
+      "IMM[0] UINT32 {8, 1, 0, 0}\n"
+      "MOV TEMP[0].xyz, CONST[0][0].xyzw\n"
+      "UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n"
+      "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n"
+      "MOV TEMP[3].xyzw, CONST[0][1].xyzw\n"
+      "STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
+      "END\n";
+
+   struct tgsi_token tokens[1024];
+   struct pipe_compute_state state = {0};
+
+   if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
+      assert(false);
+      return NULL;
+   }
+
+   state.ir_type = PIPE_SHADER_IR_TGSI;
+   state.prog = tokens;
+
+   return ctx->create_compute_state(ctx, &state);
 }
 
 /* TODO: Didn't really test 1D_ARRAY */
 void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx)
 {
-       static const char text[] =
-               "COMP\n"
-               "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
-               "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
-               "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
-               "DCL SV[0], THREAD_ID\n"
-               "DCL SV[1], BLOCK_ID\n"
-               "DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
-               "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
-               "DCL TEMP[0..3], LOCAL\n"
-               "IMM[0] UINT32 {64, 1, 0, 0}\n"
-               "MOV TEMP[0].xy, CONST[0][0].xzzw\n"
-               "UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"
-               "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n"
-               "MOV TEMP[3].xyzw, CONST[0][1].xyzw\n"
-               "STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
-               "END\n";
-
-       struct tgsi_token tokens[1024];
-       struct pipe_compute_state state = {0};
-
-       if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
-               assert(false);
-               return NULL;
-       }
-
-       state.ir_type = PIPE_SHADER_IR_TGSI;
-       state.prog = tokens;
-
-       return ctx->create_compute_state(ctx, &state);
+   static const char text[] =
+      "COMP\n"
+      "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
+      "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+      "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+      "DCL SV[0], THREAD_ID\n"
+      "DCL SV[1], BLOCK_ID\n"
+      "DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
+      "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
+      "DCL TEMP[0..3], LOCAL\n"
+      "IMM[0] UINT32 {64, 1, 0, 0}\n"
+      "MOV TEMP[0].xy, CONST[0][0].xzzw\n"
+      "UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"
+      "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n"
+      "MOV TEMP[3].xyzw, CONST[0][1].xyzw\n"
+      "STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
+      "END\n";
+
+   struct tgsi_token tokens[1024];
+   struct pipe_compute_state state = {0};
+
+   if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
+      assert(false);
+      return NULL;
+   }
+
+   state.ir_type = PIPE_SHADER_IR_TGSI;
+   state.prog = tokens;
+
+   return ctx->create_compute_state(ctx, &state);
 }
 
 void *si_clear_12bytes_buffer_shader(struct pipe_context *ctx)
 {
-       static const char text[] =
-               "COMP\n"
-               "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
-               "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
-               "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
-               "DCL SV[0], THREAD_ID\n"
-               "DCL SV[1], BLOCK_ID\n"
-               "DCL BUFFER[0]\n"
-               "DCL CONST[0][0..0]\n" // 0:xyzw
-               "DCL TEMP[0..0]\n"
-               "IMM[0] UINT32 {64, 1, 12, 0}\n"
-               "UMAD TEMP[0].x, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"
-               "UMUL TEMP[0].x, TEMP[0].xyzz, IMM[0].zzzz\n" //12 bytes
-               "STORE BUFFER[0].xyz, TEMP[0].xxxx, CONST[0][0].xyzw\n"
-               "END\n";
-
-       struct tgsi_token tokens[1024];
-       struct pipe_compute_state state = {0};
-
-       if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
-               assert(false);
-               return NULL;
-       }
-
-       state.ir_type = PIPE_SHADER_IR_TGSI;
-       state.prog = tokens;
-
-       return ctx->create_compute_state(ctx, &state);
+   static const char text[] = "COMP\n"
+                              "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
+                              "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+                              "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+                              "DCL SV[0], THREAD_ID\n"
+                              "DCL SV[1], BLOCK_ID\n"
+                              "DCL BUFFER[0]\n"
+                              "DCL CONST[0][0..0]\n" // 0:xyzw
+                              "DCL TEMP[0..0]\n"
+                              "IMM[0] UINT32 {64, 1, 12, 0}\n"
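+                              /* byte offset = (block_id.x * 64 + thread_id.x) * 12 */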
+                              "UMAD TEMP[0].x, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"
+                              "UMUL TEMP[0].x, TEMP[0].xyzz, IMM[0].zzzz\n" // 12 bytes
+                              "STORE BUFFER[0].xyz, TEMP[0].xxxx, CONST[0][0].xyzw\n"
+                              "END\n";
+
+   struct tgsi_token tokens[1024];
+   struct pipe_compute_state state = {0};
+
+   if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
+      assert(false);
+      return NULL;
+   }
+
+   state.ir_type = PIPE_SHADER_IR_TGSI;
+   state.prog = tokens;
+
+   return ctx->create_compute_state(ctx, &state);
 }
 
-
 /* Load samples from the image, and copy them to the same image. This looks like
  * a no-op, but it's not. Loads use FMASK, while stores don't, so samples are
  * reordered to match expanded FMASK.
  *
  * After the shader finishes, FMASK should be cleared to identity.
  */
-void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples,
-                               bool is_array)
+void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples, bool is_array)
 {
-       enum tgsi_texture_type target = is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA :
-                                                  TGSI_TEXTURE_2D_MSAA;
-       struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
-       if (!ureg)
-               return NULL;
-
-       ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 8);
-       ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 8);
-       ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
-
-       /* Compute the image coordinates. */
-       struct ureg_src image = ureg_DECL_image(ureg, 0, target, 0, true, false);
-       struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
-       struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
-       struct ureg_dst coord = ureg_writemask(ureg_DECL_temporary(ureg),
-                                              TGSI_WRITEMASK_XYZW);
-       ureg_UMAD(ureg, ureg_writemask(coord, TGSI_WRITEMASK_XY),
-                 ureg_swizzle(blk, 0, 1, 1, 1), ureg_imm2u(ureg, 8, 8),
-                 ureg_swizzle(tid, 0, 1, 1, 1));
-       if (is_array) {
-               ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_Z),
-                        ureg_scalar(blk, TGSI_SWIZZLE_Z));
-       }
-
-       /* Load samples, resolving FMASK. */
-       struct ureg_dst sample[8];
-       assert(num_samples <= ARRAY_SIZE(sample));
-
-       for (unsigned i = 0; i < num_samples; i++) {
-               sample[i] = ureg_DECL_temporary(ureg);
-
-               ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W),
-                        ureg_imm1u(ureg, i));
-
-               struct ureg_src srcs[] = {image, ureg_src(coord)};
-               ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &sample[i], 1, srcs, 2,
-                                TGSI_MEMORY_RESTRICT, target, 0);
-       }
-
-       /* Store samples, ignoring FMASK. */
-       for (unsigned i = 0; i < num_samples; i++) {
-               ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W),
-                        ureg_imm1u(ureg, i));
-
-               struct ureg_dst dst_image = ureg_dst(image);
-               struct ureg_src srcs[] = {ureg_src(coord), ureg_src(sample[i])};
-               ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst_image, 1, srcs, 2,
-                                TGSI_MEMORY_RESTRICT, target, 0);
-       }
-       ureg_END(ureg);
-
-       struct pipe_compute_state state = {};
-       state.ir_type = PIPE_SHADER_IR_TGSI;
-       state.prog = ureg_get_tokens(ureg, NULL);
-
-       void *cs = ctx->create_compute_state(ctx, &state);
-       ureg_destroy(ureg);
-       return cs;
+   enum tgsi_texture_type target = is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA;
+   struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
+   if (!ureg)
+      return NULL;
+
+   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 8);
+   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 8);
+   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
+
+   /* Compute the image coordinates. */
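+   /* coord.xy = blk.xy * 8 + tid.xy; coord.z = layer (if array), coord.w = sample index. */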
+   struct ureg_src image = ureg_DECL_image(ureg, 0, target, 0, true, false);
+   struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
+   struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
+   struct ureg_dst coord = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZW);
+   ureg_UMAD(ureg, ureg_writemask(coord, TGSI_WRITEMASK_XY), ureg_swizzle(blk, 0, 1, 1, 1),
+             ureg_imm2u(ureg, 8, 8), ureg_swizzle(tid, 0, 1, 1, 1));
+   if (is_array) {
+      ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_Z), ureg_scalar(blk, TGSI_SWIZZLE_Z));
+   }
+
+   /* Load samples, resolving FMASK. */
+   struct ureg_dst sample[8];
+   assert(num_samples <= ARRAY_SIZE(sample));
+
+   for (unsigned i = 0; i < num_samples; i++) {
+      sample[i] = ureg_DECL_temporary(ureg);
+
+      ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W), ureg_imm1u(ureg, i));
+
+      struct ureg_src srcs[] = {image, ureg_src(coord)};
+      ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &sample[i], 1, srcs, 2, TGSI_MEMORY_RESTRICT, target,
+                       0);
+   }
+
+   /* Store samples, ignoring FMASK. */
+   for (unsigned i = 0; i < num_samples; i++) {
+      ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W), ureg_imm1u(ureg, i));
+
+      struct ureg_dst dst_image = ureg_dst(image);
+      struct ureg_src srcs[] = {ureg_src(coord), ureg_src(sample[i])};
+      ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst_image, 1, srcs, 2, TGSI_MEMORY_RESTRICT,
+                       target, 0);
+   }
+   ureg_END(ureg);
+
+   struct pipe_compute_state state = {};
+   state.ir_type = PIPE_SHADER_IR_TGSI;
+   state.prog = ureg_get_tokens(ureg, NULL);
+
+   void *cs = ctx->create_compute_state(ctx, &state);
+   ureg_destroy(ureg);
+   return cs;
 }
 
 /* Create the compute shader that is used to collect the results of gfx10+
@@ -798,196 +766,192 @@ void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples,
  */
 void *gfx10_create_sh_query_result_cs(struct si_context *sctx)
 {
-       /* TEMP[0].x = accumulated result so far
-        * TEMP[0].y = result missing
-        * TEMP[0].z = whether we're in overflow mode
-        */
-       static const char text_tmpl[] =
-               "COMP\n"
-               "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
-               "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
-               "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
-               "DCL BUFFER[0]\n"
-               "DCL BUFFER[1]\n"
-               "DCL BUFFER[2]\n"
-               "DCL CONST[0][0..0]\n"
-               "DCL TEMP[0..5]\n"
-               "IMM[0] UINT32 {0, 7, 0, 4294967295}\n"
-               "IMM[1] UINT32 {1, 2, 4, 8}\n"
-               "IMM[2] UINT32 {16, 32, 64, 128}\n"
-
-               /*
-               acc_result = 0;
-               acc_missing = 0;
-               if (chain & 1) {
-                       acc_result = buffer[1][0];
-                       acc_missing = buffer[1][1];
-               }
-               */
-               "MOV TEMP[0].xy, IMM[0].xxxx\n"
-               "AND TEMP[5], CONST[0][0].zzzz, IMM[1].xxxx\n"
-               "UIF TEMP[5]\n"
-                       "LOAD TEMP[0].xy, BUFFER[1], IMM[0].xxxx\n"
-               "ENDIF\n"
-
-               /*
-               is_overflow (TEMP[0].z) = (config & 7) >= 2;
-               result_remaining (TEMP[1].x) = (is_overflow && acc_result) ? 0 : result_count;
-               base_offset (TEMP[1].y) = 0;
-               for (;;) {
-                       if (!result_remaining)
-                               break;
-                       result_remaining--;
-               */
-               "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
-               "USGE TEMP[0].z, TEMP[5].xxxx, IMM[1].yyyy\n"
-
-               "AND TEMP[5].x, TEMP[0].zzzz, TEMP[0].xxxx\n"
-               "UCMP TEMP[1].x, TEMP[5].xxxx, IMM[0].xxxx, CONST[0][0].wwww\n"
-               "MOV TEMP[1].y, IMM[0].xxxx\n"
-
-               "BGNLOOP\n"
-                       "USEQ TEMP[5], TEMP[1].xxxx, IMM[0].xxxx\n"
-                       "UIF TEMP[5]\n"
-                               "BRK\n"
-                       "ENDIF\n"
-                       "UADD TEMP[1].x, TEMP[1].xxxx, IMM[0].wwww\n"
-
-                       /*
-                       fence = buffer[0]@(base_offset + 32);
-                       if (!fence) {
-                               acc_missing = ~0u;
-                               break;
-                       }
-                       */
-                       "UADD TEMP[5].x, TEMP[1].yyyy, IMM[2].yyyy\n"
-                       "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
-                       "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
-                       "UIF TEMP[5]\n"
-                               "MOV TEMP[0].y, TEMP[5].xxxx\n"
-                               "BRK\n"
-                       "ENDIF\n"
-
-                       /*
-                       stream_offset (TEMP[2].x) = base_offset + offset;
-
-                       if (!(config & 7)) {
-                               acc_result += buffer[0]@stream_offset;
-                       }
-                       */
-                       "UADD TEMP[2].x, TEMP[1].yyyy, CONST[0][0].yyyy\n"
-
-                       "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
-                       "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
-                       "UIF TEMP[5]\n"
-                               "LOAD TEMP[5].x, BUFFER[0], TEMP[2].xxxx\n"
-                               "UADD TEMP[0].x, TEMP[0].xxxx, TEMP[5].xxxx\n"
-                       "ENDIF\n"
-
-                       /*
-                       if ((config & 7) >= 2) {
-                               count (TEMP[2].y) = (config & 1) ? 4 : 1;
-                       */
-                       "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
-                       "USGE TEMP[5], TEMP[5].xxxx, IMM[1].yyyy\n"
-                       "UIF TEMP[5]\n"
-                               "AND TEMP[5].x, CONST[0][0].xxxx, IMM[1].xxxx\n"
-                               "UCMP TEMP[2].y, TEMP[5].xxxx, IMM[1].zzzz, IMM[1].xxxx\n"
-
-                               /*
-                               do {
-                                       generated = buffer[0]@stream_offset;
-                                       emitted = buffer[0]@(stream_offset + 16);
-                                       if (generated != emitted) {
-                                               acc_result = 1;
-                                               result_remaining = 0;
-                                               break;
-                                       }
-
-                                       stream_offset += 4;
-                               } while (--count);
-                               */
-                               "BGNLOOP\n"
-                                       "UADD TEMP[5].x, TEMP[2].xxxx, IMM[2].xxxx\n"
-                                       "LOAD TEMP[4].x, BUFFER[0], TEMP[2].xxxx\n"
-                                       "LOAD TEMP[4].y, BUFFER[0], TEMP[5].xxxx\n"
-                                       "USNE TEMP[5], TEMP[4].xxxx, TEMP[4].yyyy\n"
-                                       "UIF TEMP[5]\n"
-                                               "MOV TEMP[0].x, IMM[1].xxxx\n"
-                                               "MOV TEMP[1].y, IMM[0].xxxx\n"
-                                               "BRK\n"
-                                       "ENDIF\n"
-
-                                       "UADD TEMP[2].y, TEMP[2].yyyy, IMM[0].wwww\n"
-                                       "USEQ TEMP[5], TEMP[2].yyyy, IMM[0].xxxx\n"
-                                       "UIF TEMP[5]\n"
-                                               "BRK\n"
-                                       "ENDIF\n"
-                                       "UADD TEMP[2].x, TEMP[2].xxxx, IMM[1].zzzz\n"
-                               "ENDLOOP\n"
-                       "ENDIF\n"
-
-               /*
-                       base_offset += 64;
-               } // end outer loop
-               */
-                       "UADD TEMP[1].y, TEMP[1].yyyy, IMM[2].zzzz\n"
-               "ENDLOOP\n"
-
-               /*
-               if (chain & 2) {
-                       buffer[2][0] = acc_result;
-                       buffer[2][1] = acc_missing;
-               } else {
-               */
-               "AND TEMP[5], CONST[0][0].zzzz, IMM[1].yyyy\n"
-               "UIF TEMP[5]\n"
-                       "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0]\n"
-               "ELSE\n"
-
-                       /*
-                       if ((config & 7) == 1) {
-                               acc_result = acc_missing ? 0 : 1;
-                               acc_missing = 0;
-                       }
-                       */
-                       "AND TEMP[5], CONST[0][0].xxxx, IMM[0].yyyy\n"
-                       "USEQ TEMP[5], TEMP[5].xxxx, IMM[1].xxxx\n"
-                       "UIF TEMP[5]\n"
-                               "UCMP TEMP[0].x, TEMP[0].yyyy, IMM[0].xxxx, IMM[1].xxxx\n"
-                               "MOV TEMP[0].y, IMM[0].xxxx\n"
-                       "ENDIF\n"
-
-                       /*
-                       if (!acc_missing) {
-                               buffer[2][0] = acc_result;
-                               if (config & 8)
-                                       buffer[2][1] = 0;
-                       }
-                       */
-                       "USEQ TEMP[5], TEMP[0].yyyy, IMM[0].xxxx\n"
-                       "UIF TEMP[5]\n"
-                               "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
-
-                               "AND TEMP[5], CONST[0][0].xxxx, IMM[1].wwww\n"
-                               "UIF TEMP[5]\n"
-                                       "STORE BUFFER[2].x, IMM[1].zzzz, TEMP[0].yyyy\n"
-                               "ENDIF\n"
-                       "ENDIF\n"
-               "ENDIF\n"
-
-               "END\n";
-
-       struct tgsi_token tokens[1024];
-       struct pipe_compute_state state = {};
-
-       if (!tgsi_text_translate(text_tmpl, tokens, ARRAY_SIZE(tokens))) {
-               assert(false);
-               return NULL;
-       }
-
-       state.ir_type = PIPE_SHADER_IR_TGSI;
-       state.prog = tokens;
-
-       return sctx->b.create_compute_state(&sctx->b, &state);
+   /* TEMP[0].x = accumulated result so far
+    * TEMP[0].y = result missing
+    * TEMP[0].z = whether we're in overflow mode
+    */
+   static const char text_tmpl[] = "COMP\n"
+                                   "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
+                                   "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+                                   "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+                                   "DCL BUFFER[0]\n"
+                                   "DCL BUFFER[1]\n"
+                                   "DCL BUFFER[2]\n"
+                                   "DCL CONST[0][0..0]\n"
+                                   "DCL TEMP[0..5]\n"
+                                   "IMM[0] UINT32 {0, 7, 0, 4294967295}\n"
+                                   "IMM[1] UINT32 {1, 2, 4, 8}\n"
+                                   "IMM[2] UINT32 {16, 32, 64, 128}\n"
+
+                                   /*
+                                   acc_result = 0;
+                                   acc_missing = 0;
+                                   if (chain & 1) {
+                                           acc_result = buffer[1][0];
+                                           acc_missing = buffer[1][1];
+                                   }
+                                   */
+                                   "MOV TEMP[0].xy, IMM[0].xxxx\n"
+                                   "AND TEMP[5], CONST[0][0].zzzz, IMM[1].xxxx\n"
+                                   "UIF TEMP[5]\n"
+                                   "LOAD TEMP[0].xy, BUFFER[1], IMM[0].xxxx\n"
+                                   "ENDIF\n"
+
+                                   /*
+                                   is_overflow (TEMP[0].z) = (config & 7) >= 2;
+                                   result_remaining (TEMP[1].x) =
+                                      (is_overflow && acc_result) ? 0 : result_count;
+                                   base_offset (TEMP[1].y) = 0;
+                                   for (;;) {
+                                           if (!result_remaining)
+                                                   break;
+                                           result_remaining--;
+                                   */
+                                   "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
+                                   "USGE TEMP[0].z, TEMP[5].xxxx, IMM[1].yyyy\n"
+
+                                   "AND TEMP[5].x, TEMP[0].zzzz, TEMP[0].xxxx\n"
+                                   "UCMP TEMP[1].x, TEMP[5].xxxx, IMM[0].xxxx, CONST[0][0].wwww\n"
+                                   "MOV TEMP[1].y, IMM[0].xxxx\n"
+
+                                   "BGNLOOP\n"
+                                   "USEQ TEMP[5], TEMP[1].xxxx, IMM[0].xxxx\n"
+                                   "UIF TEMP[5]\n"
+                                   "BRK\n"
+                                   "ENDIF\n"
+                                   "UADD TEMP[1].x, TEMP[1].xxxx, IMM[0].wwww\n"
+
+                                   /*
+                                   fence = buffer[0]@(base_offset + 32);
+                                   if (!fence) {
+                                           acc_missing = ~0u;
+                                           break;
+                                   }
+                                   */
+                                   "UADD TEMP[5].x, TEMP[1].yyyy, IMM[2].yyyy\n"
+                                   "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
+                                   "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
+                                   "UIF TEMP[5]\n"
+                                   "MOV TEMP[0].y, TEMP[5].xxxx\n"
+                                   "BRK\n"
+                                   "ENDIF\n"
+
+                                   /*
+                                   stream_offset (TEMP[2].x) = base_offset + offset;
+
+                                   if (!(config & 7)) {
+                                           acc_result += buffer[0]@stream_offset;
+                                   }
+                                   */
+                                   "UADD TEMP[2].x, TEMP[1].yyyy, CONST[0][0].yyyy\n"
+
+                                   "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
+                                   "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
+                                   "UIF TEMP[5]\n"
+                                   "LOAD TEMP[5].x, BUFFER[0], TEMP[2].xxxx\n"
+                                   "UADD TEMP[0].x, TEMP[0].xxxx, TEMP[5].xxxx\n"
+                                   "ENDIF\n"
+
+                                   /*
+                                   if ((config & 7) >= 2) {
+                                           count (TEMP[2].y) = (config & 1) ? 4 : 1;
+                                   */
+                                   "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
+                                   "USGE TEMP[5], TEMP[5].xxxx, IMM[1].yyyy\n"
+                                   "UIF TEMP[5]\n"
+                                   "AND TEMP[5].x, CONST[0][0].xxxx, IMM[1].xxxx\n"
+                                   "UCMP TEMP[2].y, TEMP[5].xxxx, IMM[1].zzzz, IMM[1].xxxx\n"
+
+                                   /*
+                                   do {
+                                           generated = buffer[0]@stream_offset;
+                                           emitted = buffer[0]@(stream_offset + 16);
+                                           if (generated != emitted) {
+                                                   acc_result = 1;
+                                                   result_remaining = 0;
+                                                   break;
+                                           }
+
+                                           stream_offset += 4;
+                                   } while (--count);
+                                   */
+                                   "BGNLOOP\n"
+                                   "UADD TEMP[5].x, TEMP[2].xxxx, IMM[2].xxxx\n"
+                                   "LOAD TEMP[4].x, BUFFER[0], TEMP[2].xxxx\n"
+                                   "LOAD TEMP[4].y, BUFFER[0], TEMP[5].xxxx\n"
+                                   "USNE TEMP[5], TEMP[4].xxxx, TEMP[4].yyyy\n"
+                                   "UIF TEMP[5]\n"
+                                   "MOV TEMP[0].x, IMM[1].xxxx\n"
+                                   "MOV TEMP[1].y, IMM[0].xxxx\n"
+                                   "BRK\n"
+                                   "ENDIF\n"
+
+                                   "UADD TEMP[2].y, TEMP[2].yyyy, IMM[0].wwww\n"
+                                   "USEQ TEMP[5], TEMP[2].yyyy, IMM[0].xxxx\n"
+                                   "UIF TEMP[5]\n"
+                                   "BRK\n"
+                                   "ENDIF\n"
+                                   "UADD TEMP[2].x, TEMP[2].xxxx, IMM[1].zzzz\n"
+                                   "ENDLOOP\n"
+                                   "ENDIF\n"
+
+                                   /*
+                                           base_offset += 64;
+                                   } // end outer loop
+                                   */
+                                   "UADD TEMP[1].y, TEMP[1].yyyy, IMM[2].zzzz\n"
+                                   "ENDLOOP\n"
+
+                                   /*
+                                   if (chain & 2) {
+                                           buffer[2][0] = acc_result;
+                                           buffer[2][1] = acc_missing;
+                                   } else {
+                                   */
+                                   "AND TEMP[5], CONST[0][0].zzzz, IMM[1].yyyy\n"
+                                   "UIF TEMP[5]\n"
+                                   "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0]\n"
+                                   "ELSE\n"
+
+                                   /*
+                                   if ((config & 7) == 1) {
+                                           acc_result = acc_missing ? 0 : 1;
+                                           acc_missing = 0;
+                                   }
+                                   */
+                                   "AND TEMP[5], CONST[0][0].xxxx, IMM[0].yyyy\n"
+                                   "USEQ TEMP[5], TEMP[5].xxxx, IMM[1].xxxx\n"
+                                   "UIF TEMP[5]\n"
+                                   "UCMP TEMP[0].x, TEMP[0].yyyy, IMM[0].xxxx, IMM[1].xxxx\n"
+                                   "MOV TEMP[0].y, IMM[0].xxxx\n"
+                                   "ENDIF\n"
+
+                                   /*
+                                   if (!acc_missing) {
+                                           buffer[2][0] = acc_result;
+                                           if (config & 8)
+                                                   buffer[2][1] = 0;
+                                   }
+                                   */
+                                   "USEQ TEMP[5], TEMP[0].yyyy, IMM[0].xxxx\n"
+                                   "UIF TEMP[5]\n"
+                                   "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
+
+                                   "AND TEMP[5], CONST[0][0].xxxx, IMM[1].wwww\n"
+                                   "UIF TEMP[5]\n"
+                                   "STORE BUFFER[2].x, IMM[1].zzzz, TEMP[0].yyyy\n"
+                                   "ENDIF\n"
+                                   "ENDIF\n"
+                                   "ENDIF\n"
+
+                                   "END\n";
+
+   struct tgsi_token tokens[1024];
+   struct pipe_compute_state state = {};
+
+   if (!tgsi_text_translate(text_tmpl, tokens, ARRAY_SIZE(tokens))) {
+      assert(false);
+      return NULL;
+   }
+
+   state.ir_type = PIPE_SHADER_IR_TGSI;
+   state.prog = tokens;
+
+   return sctx->b.create_compute_state(&sctx->b, &state);
 }
index 46d7c71b2de6fee05b5da1a43329a15b4b4de514..60aa08655022815557c97b246db8da5e63dc02b3 100644
  */
 
 #include "si_build_pm4.h"
-#include "sid.h"
 #include "si_query.h"
-
-#include "util/u_dual_blend.h"
+#include "sid.h"
+#include "util/fast_idiv_by_const.h"
 #include "util/format/u_format.h"
 #include "util/format/u_format_s3tc.h"
+#include "util/u_dual_blend.h"
 #include "util/u_memory.h"
 #include "util/u_resource.h"
 #include "util/u_upload_mgr.h"
-#include "util/fast_idiv_by_const.h"
 
 struct gfx10_format {
-    unsigned img_format:9;
+   unsigned img_format : 9;
 
-    /* Various formats are only supported with workarounds for vertex fetch,
-     * and some 32_32_32 formats are supported natively, but only for buffers
-     * (possibly with some image support, actually, but no filtering). */
-    bool buffers_only:1;
+   /* Various formats are only supported with workarounds for vertex fetch,
+    * and some 32_32_32 formats are supported natively, but only for buffers
+    * (possibly with some image support, actually, but no filtering). */
+   bool buffers_only : 1;
 };
 
 #include "gfx10_format_table.h"
 
 static unsigned si_map_swizzle(unsigned swizzle)
 {
-       switch (swizzle) {
-       case PIPE_SWIZZLE_Y:
-               return V_008F0C_SQ_SEL_Y;
-       case PIPE_SWIZZLE_Z:
-               return V_008F0C_SQ_SEL_Z;
-       case PIPE_SWIZZLE_W:
-               return V_008F0C_SQ_SEL_W;
-       case PIPE_SWIZZLE_0:
-               return V_008F0C_SQ_SEL_0;
-       case PIPE_SWIZZLE_1:
-               return V_008F0C_SQ_SEL_1;
-       default: /* PIPE_SWIZZLE_X */
-               return V_008F0C_SQ_SEL_X;
-       }
+   switch (swizzle) {
+   case PIPE_SWIZZLE_Y:
+      return V_008F0C_SQ_SEL_Y;
+   case PIPE_SWIZZLE_Z:
+      return V_008F0C_SQ_SEL_Z;
+   case PIPE_SWIZZLE_W:
+      return V_008F0C_SQ_SEL_W;
+   case PIPE_SWIZZLE_0:
+      return V_008F0C_SQ_SEL_0;
+   case PIPE_SWIZZLE_1:
+      return V_008F0C_SQ_SEL_1;
+   default: /* PIPE_SWIZZLE_X */
+      return V_008F0C_SQ_SEL_X;
+   }
 }
 
 /* 12.4 fixed-point */
 static unsigned si_pack_float_12p4(float x)
 {
-       return x <= 0    ? 0 :
-              x >= 4096 ? 0xffff : x * 16;
+   return x <= 0 ? 0 : x >= 4096 ? 0xffff : x * 16;
 }
 
 /*
@@ -78,202 +76,191 @@ static unsigned si_pack_float_12p4(float x)
  */
 static void si_emit_cb_render_state(struct si_context *sctx)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       struct si_state_blend *blend = sctx->queued.named.blend;
-       /* CB_COLORn_INFO.FORMAT=INVALID should disable unbound colorbuffers,
-        * but you never know. */
-       uint32_t cb_target_mask = sctx->framebuffer.colorbuf_enabled_4bit &
-                                 blend->cb_target_mask;
-       unsigned i;
-
-       /* Avoid a hang that happens when dual source blending is enabled
-        * but there is not enough color outputs. This is undefined behavior,
-        * so disable color writes completely.
-        *
-        * Reproducible with Unigine Heaven 4.0 and drirc missing.
-        */
-       if (blend->dual_src_blend &&
-           sctx->ps_shader.cso &&
-           (sctx->ps_shader.cso->info.colors_written & 0x3) != 0x3)
-               cb_target_mask = 0;
-
-       /* GFX9: Flush DFSM when CB_TARGET_MASK changes.
-        * I think we don't have to do anything between IBs.
-        */
-       if (sctx->screen->dpbb_allowed &&
-           sctx->last_cb_target_mask != cb_target_mask) {
-               sctx->last_cb_target_mask = cb_target_mask;
-
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-               radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
-       }
-
-       unsigned initial_cdw = cs->current.cdw;
-       radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK,
-                                  SI_TRACKED_CB_TARGET_MASK, cb_target_mask);
-
-       if (sctx->chip_class >= GFX8) {
-               /* DCC MSAA workaround.
-                * Alternatively, we can set CB_COLORi_DCC_CONTROL.OVERWRITE_-
-                * COMBINER_DISABLE, but that would be more complicated.
-                */
-               bool oc_disable = blend->dcc_msaa_corruption_4bit & cb_target_mask &&
-                                 sctx->framebuffer.nr_samples >= 2;
-               unsigned watermark = sctx->framebuffer.dcc_overwrite_combiner_watermark;
-
-               radeon_opt_set_context_reg(
-                               sctx, R_028424_CB_DCC_CONTROL,
-                               SI_TRACKED_CB_DCC_CONTROL,
-                               S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(sctx->chip_class <= GFX9) |
-                               S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
-                               S_028424_OVERWRITE_COMBINER_DISABLE(oc_disable) |
-                               S_028424_DISABLE_CONSTANT_ENCODE_REG(sctx->screen->info.has_dcc_constant_encode));
-       }
-
-       /* RB+ register settings. */
-       if (sctx->screen->info.rbplus_allowed) {
-               unsigned spi_shader_col_format =
-                       sctx->ps_shader.cso ?
-                       sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format : 0;
-               unsigned sx_ps_downconvert = 0;
-               unsigned sx_blend_opt_epsilon = 0;
-               unsigned sx_blend_opt_control = 0;
-
-               for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
-                       struct si_surface *surf =
-                               (struct si_surface*)sctx->framebuffer.state.cbufs[i];
-                       unsigned format, swap, spi_format, colormask;
-                       bool has_alpha, has_rgb;
-
-                       if (!surf) {
-                               /* If the color buffer is not set, the driver sets 32_R
-                                * as the SPI color format, because the hw doesn't allow
-                                * holes between color outputs, so also set this to
-                                * enable RB+.
-                                */
-                               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
-                               continue;
-                       }
-
-                       format = G_028C70_FORMAT(surf->cb_color_info);
-                       swap = G_028C70_COMP_SWAP(surf->cb_color_info);
-                       spi_format = (spi_shader_col_format >> (i * 4)) & 0xf;
-                       colormask = (cb_target_mask >> (i * 4)) & 0xf;
-
-                       /* Set if RGB and A are present. */
-                       has_alpha = !G_028C74_FORCE_DST_ALPHA_1(surf->cb_color_attrib);
-
-                       if (format == V_028C70_COLOR_8 ||
-                           format == V_028C70_COLOR_16 ||
-                           format == V_028C70_COLOR_32)
-                               has_rgb = !has_alpha;
-                       else
-                               has_rgb = true;
-
-                       /* Check the colormask and export format. */
-                       if (!(colormask & (PIPE_MASK_RGBA & ~PIPE_MASK_A)))
-                               has_rgb = false;
-                       if (!(colormask & PIPE_MASK_A))
-                               has_alpha = false;
-
-                       if (spi_format == V_028714_SPI_SHADER_ZERO) {
-                               has_rgb = false;
-                               has_alpha = false;
-                       }
-
-                       /* Disable value checking for disabled channels. */
-                       if (!has_rgb)
-                               sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
-                       if (!has_alpha)
-                               sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
-
-                       /* Enable down-conversion for 32bpp and smaller formats. */
-                       switch (format) {
-                       case V_028C70_COLOR_8:
-                       case V_028C70_COLOR_8_8:
-                       case V_028C70_COLOR_8_8_8_8:
-                               /* For 1 and 2-channel formats, use the superset thereof. */
-                               if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
-                                   spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
-                                   spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
-                                       sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
-                                       sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
-                               }
-                               break;
-
-                       case V_028C70_COLOR_5_6_5:
-                               if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
-                                       sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
-                                       sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
-                               }
-                               break;
-
-                       case V_028C70_COLOR_1_5_5_5:
-                               if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
-                                       sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
-                                       sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
-                               }
-                               break;
-
-                       case V_028C70_COLOR_4_4_4_4:
-                               if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
-                                       sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
-                                       sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
-                               }
-                               break;
-
-                       case V_028C70_COLOR_32:
-                               if (swap == V_028C70_SWAP_STD &&
-                                   spi_format == V_028714_SPI_SHADER_32_R)
-                                       sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
-                               else if (swap == V_028C70_SWAP_ALT_REV &&
-                                        spi_format == V_028714_SPI_SHADER_32_AR)
-                                       sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
-                               break;
-
-                       case V_028C70_COLOR_16:
-                       case V_028C70_COLOR_16_16:
-                               /* For 1-channel formats, use the superset thereof. */
-                               if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
-                                   spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
-                                   spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
-                                   spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
-                                       if (swap == V_028C70_SWAP_STD ||
-                                           swap == V_028C70_SWAP_STD_REV)
-                                               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
-                                       else
-                                               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
-                               }
-                               break;
-
-                       case V_028C70_COLOR_10_11_11:
-                               if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
-                                       sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
-                               break;
-
-                       case V_028C70_COLOR_2_10_10_10:
-                               if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
-                                       sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
-                                       sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
-                               }
-                               break;
-                       }
-               }
-
-               /* If there are no color outputs, the first color export is
-                * always enabled as 32_R, so also set this to enable RB+.
-                */
-               if (!sx_ps_downconvert)
-                       sx_ps_downconvert = V_028754_SX_RT_EXPORT_32_R;
-
-               /* SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL */
-               radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT,
-                                           SI_TRACKED_SX_PS_DOWNCONVERT,
-                                           sx_ps_downconvert, sx_blend_opt_epsilon,
-                                           sx_blend_opt_control);
-       }
-       if (initial_cdw != cs->current.cdw)
-               sctx->context_roll = true;
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   struct si_state_blend *blend = sctx->queued.named.blend;
+   /* CB_COLORn_INFO.FORMAT=INVALID should disable unbound colorbuffers,
+    * but you never know. */
+   uint32_t cb_target_mask = sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_mask;
+   unsigned i;
+
+   /* Avoid a hang that happens when dual source blending is enabled
+    * but there are not enough color outputs. This is undefined behavior,
+    * so disable color writes completely.
+    *
+    * Reproducible with Unigine Heaven 4.0 and drirc missing.
+    */
+   if (blend->dual_src_blend && sctx->ps_shader.cso &&
+       (sctx->ps_shader.cso->info.colors_written & 0x3) != 0x3)
+      cb_target_mask = 0;
+
+   /* GFX9: Flush DFSM when CB_TARGET_MASK changes.
+    * I think we don't have to do anything between IBs.
+    */
+   if (sctx->screen->dpbb_allowed && sctx->last_cb_target_mask != cb_target_mask) {
+      sctx->last_cb_target_mask = cb_target_mask;
+
+      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+      radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
+   }
+
+   unsigned initial_cdw = cs->current.cdw;
+   radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK, SI_TRACKED_CB_TARGET_MASK,
+                              cb_target_mask);
+
+   if (sctx->chip_class >= GFX8) {
+      /* DCC MSAA workaround.
+       * Alternatively, we can set CB_COLORi_DCC_CONTROL.OVERWRITE_-
+       * COMBINER_DISABLE, but that would be more complicated.
+       */
+      bool oc_disable =
+         blend->dcc_msaa_corruption_4bit & cb_target_mask && sctx->framebuffer.nr_samples >= 2;
+      unsigned watermark = sctx->framebuffer.dcc_overwrite_combiner_watermark;
+
+      radeon_opt_set_context_reg(
+         sctx, R_028424_CB_DCC_CONTROL, SI_TRACKED_CB_DCC_CONTROL,
+         S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(sctx->chip_class <= GFX9) |
+            S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
+            S_028424_OVERWRITE_COMBINER_DISABLE(oc_disable) |
+            S_028424_DISABLE_CONSTANT_ENCODE_REG(sctx->screen->info.has_dcc_constant_encode));
+   }
+
+   /* RB+ register settings. */
+   if (sctx->screen->info.rbplus_allowed) {
+      unsigned spi_shader_col_format =
+         sctx->ps_shader.cso ? sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format
+                             : 0;
+      unsigned sx_ps_downconvert = 0;
+      unsigned sx_blend_opt_epsilon = 0;
+      unsigned sx_blend_opt_control = 0;
+
+      for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
+         struct si_surface *surf = (struct si_surface *)sctx->framebuffer.state.cbufs[i];
+         unsigned format, swap, spi_format, colormask;
+         bool has_alpha, has_rgb;
+
+         if (!surf) {
+            /* If the color buffer is not set, the driver sets 32_R
+             * as the SPI color format, because the hw doesn't allow
+             * holes between color outputs, so also set this to
+             * enable RB+.
+             */
+            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
+            continue;
+         }
+
+         format = G_028C70_FORMAT(surf->cb_color_info);
+         swap = G_028C70_COMP_SWAP(surf->cb_color_info);
+         spi_format = (spi_shader_col_format >> (i * 4)) & 0xf;
+         colormask = (cb_target_mask >> (i * 4)) & 0xf;
+
+         /* Set if RGB and A are present. */
+         has_alpha = !G_028C74_FORCE_DST_ALPHA_1(surf->cb_color_attrib);
+
+         if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_16 ||
+             format == V_028C70_COLOR_32)
+            has_rgb = !has_alpha;
+         else
+            has_rgb = true;
+
+         /* Check the colormask and export format. */
+         if (!(colormask & (PIPE_MASK_RGBA & ~PIPE_MASK_A)))
+            has_rgb = false;
+         if (!(colormask & PIPE_MASK_A))
+            has_alpha = false;
+
+         if (spi_format == V_028714_SPI_SHADER_ZERO) {
+            has_rgb = false;
+            has_alpha = false;
+         }
+
+         /* Disable value checking for disabled channels. */
+         if (!has_rgb)
+            sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
+         if (!has_alpha)
+            sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
+
+         /* Enable down-conversion for 32bpp and smaller formats. */
+         switch (format) {
+         case V_028C70_COLOR_8:
+         case V_028C70_COLOR_8_8:
+         case V_028C70_COLOR_8_8_8_8:
+            /* For 1 and 2-channel formats, use the superset thereof. */
+            if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
+                spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
+                spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
+               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
+               sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
+            }
+            break;
+
+         case V_028C70_COLOR_5_6_5:
+            if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
+               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
+               sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
+            }
+            break;
+
+         case V_028C70_COLOR_1_5_5_5:
+            if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
+               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
+               sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
+            }
+            break;
+
+         case V_028C70_COLOR_4_4_4_4:
+            if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
+               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
+               sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
+            }
+            break;
+
+         case V_028C70_COLOR_32:
+            if (swap == V_028C70_SWAP_STD && spi_format == V_028714_SPI_SHADER_32_R)
+               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
+            else if (swap == V_028C70_SWAP_ALT_REV && spi_format == V_028714_SPI_SHADER_32_AR)
+               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
+            break;
+
+         case V_028C70_COLOR_16:
+         case V_028C70_COLOR_16_16:
+            /* For 1-channel formats, use the superset thereof. */
+            if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
+                spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
+                spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
+                spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
+               if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV)
+                  sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
+               else
+                  sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
+            }
+            break;
+
+         case V_028C70_COLOR_10_11_11:
+            if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
+               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
+            break;
+
+         case V_028C70_COLOR_2_10_10_10:
+            if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
+               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
+               sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
+            }
+            break;
+         }
+      }
+
+      /* If there are no color outputs, the first color export is
+       * always enabled as 32_R, so also set this to enable RB+.
+       */
+      if (!sx_ps_downconvert)
+         sx_ps_downconvert = V_028754_SX_RT_EXPORT_32_R;
+
+      /* SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL */
+      radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT, SI_TRACKED_SX_PS_DOWNCONVERT,
+                                  sx_ps_downconvert, sx_blend_opt_epsilon, sx_blend_opt_control);
+   }
+   if (initial_cdw != cs->current.cdw)
+      sctx->context_roll = true;
 }
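
As a rough illustration of the per-MRT packing in si_emit_cb_render_state above (example values only; field names follow the code):

   /* Each color buffer i occupies the 4-bit field at bits [i*4+3 : i*4] of
    * SX_PS_DOWNCONVERT / SX_BLEND_OPT_EPSILON / SX_BLEND_OPT_CONTROL.
    * E.g. cbuf0 = RGBA8 with an FP16_ABGR export sets
    *    SX_PS_DOWNCONVERT[3:0]    = SX_RT_EXPORT_8_8_8_8
    *    SX_BLEND_OPT_EPSILON[3:0] = 8BIT_FORMAT
    * while an unbound cbuf1 still gets SX_PS_DOWNCONVERT[7:4] = SX_RT_EXPORT_32_R
    * so that RB+ stays usable.
    */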
 
 /*
@@ -282,551 +269,507 @@ static void si_emit_cb_render_state(struct si_context *sctx)
 
 static uint32_t si_translate_blend_function(int blend_func)
 {
-       switch (blend_func) {
-       case PIPE_BLEND_ADD:
-               return V_028780_COMB_DST_PLUS_SRC;
-       case PIPE_BLEND_SUBTRACT:
-               return V_028780_COMB_SRC_MINUS_DST;
-       case PIPE_BLEND_REVERSE_SUBTRACT:
-               return V_028780_COMB_DST_MINUS_SRC;
-       case PIPE_BLEND_MIN:
-               return V_028780_COMB_MIN_DST_SRC;
-       case PIPE_BLEND_MAX:
-               return V_028780_COMB_MAX_DST_SRC;
-       default:
-               PRINT_ERR("Unknown blend function %d\n", blend_func);
-               assert(0);
-               break;
-       }
-       return 0;
+   switch (blend_func) {
+   case PIPE_BLEND_ADD:
+      return V_028780_COMB_DST_PLUS_SRC;
+   case PIPE_BLEND_SUBTRACT:
+      return V_028780_COMB_SRC_MINUS_DST;
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      return V_028780_COMB_DST_MINUS_SRC;
+   case PIPE_BLEND_MIN:
+      return V_028780_COMB_MIN_DST_SRC;
+   case PIPE_BLEND_MAX:
+      return V_028780_COMB_MAX_DST_SRC;
+   default:
+      PRINT_ERR("Unknown blend function %d\n", blend_func);
+      assert(0);
+      break;
+   }
+   return 0;
 }
 
 static uint32_t si_translate_blend_factor(int blend_fact)
 {
-       switch (blend_fact) {
-       case PIPE_BLENDFACTOR_ONE:
-               return V_028780_BLEND_ONE;
-       case PIPE_BLENDFACTOR_SRC_COLOR:
-               return V_028780_BLEND_SRC_COLOR;
-       case PIPE_BLENDFACTOR_SRC_ALPHA:
-               return V_028780_BLEND_SRC_ALPHA;
-       case PIPE_BLENDFACTOR_DST_ALPHA:
-               return V_028780_BLEND_DST_ALPHA;
-       case PIPE_BLENDFACTOR_DST_COLOR:
-               return V_028780_BLEND_DST_COLOR;
-       case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-               return V_028780_BLEND_SRC_ALPHA_SATURATE;
-       case PIPE_BLENDFACTOR_CONST_COLOR:
-               return V_028780_BLEND_CONSTANT_COLOR;
-       case PIPE_BLENDFACTOR_CONST_ALPHA:
-               return V_028780_BLEND_CONSTANT_ALPHA;
-       case PIPE_BLENDFACTOR_ZERO:
-               return V_028780_BLEND_ZERO;
-       case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-               return V_028780_BLEND_ONE_MINUS_SRC_COLOR;
-       case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-               return V_028780_BLEND_ONE_MINUS_SRC_ALPHA;
-       case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-               return V_028780_BLEND_ONE_MINUS_DST_ALPHA;
-       case PIPE_BLENDFACTOR_INV_DST_COLOR:
-               return V_028780_BLEND_ONE_MINUS_DST_COLOR;
-       case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-               return V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR;
-       case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-               return V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA;
-       case PIPE_BLENDFACTOR_SRC1_COLOR:
-               return V_028780_BLEND_SRC1_COLOR;
-       case PIPE_BLENDFACTOR_SRC1_ALPHA:
-               return V_028780_BLEND_SRC1_ALPHA;
-       case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
-               return V_028780_BLEND_INV_SRC1_COLOR;
-       case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-               return V_028780_BLEND_INV_SRC1_ALPHA;
-       default:
-               PRINT_ERR("Bad blend factor %d not supported!\n", blend_fact);
-               assert(0);
-               break;
-       }
-       return 0;
+   switch (blend_fact) {
+   case PIPE_BLENDFACTOR_ONE:
+      return V_028780_BLEND_ONE;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      return V_028780_BLEND_SRC_COLOR;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      return V_028780_BLEND_SRC_ALPHA;
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      return V_028780_BLEND_DST_ALPHA;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      return V_028780_BLEND_DST_COLOR;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      return V_028780_BLEND_SRC_ALPHA_SATURATE;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      return V_028780_BLEND_CONSTANT_COLOR;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      return V_028780_BLEND_CONSTANT_ALPHA;
+   case PIPE_BLENDFACTOR_ZERO:
+      return V_028780_BLEND_ZERO;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      return V_028780_BLEND_ONE_MINUS_SRC_COLOR;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      return V_028780_BLEND_ONE_MINUS_SRC_ALPHA;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      return V_028780_BLEND_ONE_MINUS_DST_ALPHA;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      return V_028780_BLEND_ONE_MINUS_DST_COLOR;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      return V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      return V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA;
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+      return V_028780_BLEND_SRC1_COLOR;
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+      return V_028780_BLEND_SRC1_ALPHA;
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+      return V_028780_BLEND_INV_SRC1_COLOR;
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+      return V_028780_BLEND_INV_SRC1_ALPHA;
+   default:
+      PRINT_ERR("Bad blend factor %d not supported!\n", blend_fact);
+      assert(0);
+      break;
+   }
+   return 0;
 }
 
 static uint32_t si_translate_blend_opt_function(int blend_func)
 {
-       switch (blend_func) {
-       case PIPE_BLEND_ADD:
-               return V_028760_OPT_COMB_ADD;
-       case PIPE_BLEND_SUBTRACT:
-               return V_028760_OPT_COMB_SUBTRACT;
-       case PIPE_BLEND_REVERSE_SUBTRACT:
-               return V_028760_OPT_COMB_REVSUBTRACT;
-       case PIPE_BLEND_MIN:
-               return V_028760_OPT_COMB_MIN;
-       case PIPE_BLEND_MAX:
-               return V_028760_OPT_COMB_MAX;
-       default:
-               return V_028760_OPT_COMB_BLEND_DISABLED;
-       }
+   switch (blend_func) {
+   case PIPE_BLEND_ADD:
+      return V_028760_OPT_COMB_ADD;
+   case PIPE_BLEND_SUBTRACT:
+      return V_028760_OPT_COMB_SUBTRACT;
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      return V_028760_OPT_COMB_REVSUBTRACT;
+   case PIPE_BLEND_MIN:
+      return V_028760_OPT_COMB_MIN;
+   case PIPE_BLEND_MAX:
+      return V_028760_OPT_COMB_MAX;
+   default:
+      return V_028760_OPT_COMB_BLEND_DISABLED;
+   }
 }
 
 static uint32_t si_translate_blend_opt_factor(int blend_fact, bool is_alpha)
 {
-       switch (blend_fact) {
-       case PIPE_BLENDFACTOR_ZERO:
-               return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL;
-       case PIPE_BLENDFACTOR_ONE:
-               return V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE;
-       case PIPE_BLENDFACTOR_SRC_COLOR:
-               return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0
-                               : V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0;
-       case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-               return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1
-                               : V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1;
-       case PIPE_BLENDFACTOR_SRC_ALPHA:
-               return V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0;
-       case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-               return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1;
-       case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-               return is_alpha ? V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE
-                               : V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
-       default:
-               return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
-       }
+   switch (blend_fact) {
+   case PIPE_BLENDFACTOR_ZERO:
+      return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL;
+   case PIPE_BLENDFACTOR_ONE:
+      return V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0
+                      : V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1
+                      : V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      return V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      return is_alpha ? V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE
+                      : V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
+   default:
+      return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
+   }
 }
 
-static void si_blend_check_commutativity(struct si_screen *sscreen,
-                                        struct si_state_blend *blend,
-                                        enum pipe_blend_func func,
-                                        enum pipe_blendfactor src,
-                                        enum pipe_blendfactor dst,
-                                        unsigned chanmask)
+static void si_blend_check_commutativity(struct si_screen *sscreen, struct si_state_blend *blend,
+                                         enum pipe_blend_func func, enum pipe_blendfactor src,
+                                         enum pipe_blendfactor dst, unsigned chanmask)
 {
-       /* Src factor is allowed when it does not depend on Dst */
-       static const uint32_t src_allowed =
-               (1u << PIPE_BLENDFACTOR_ONE) |
-               (1u << PIPE_BLENDFACTOR_SRC_COLOR) |
-               (1u << PIPE_BLENDFACTOR_SRC_ALPHA) |
-               (1u << PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) |
-               (1u << PIPE_BLENDFACTOR_CONST_COLOR) |
-               (1u << PIPE_BLENDFACTOR_CONST_ALPHA) |
-               (1u << PIPE_BLENDFACTOR_SRC1_COLOR) |
-               (1u << PIPE_BLENDFACTOR_SRC1_ALPHA) |
-               (1u << PIPE_BLENDFACTOR_ZERO) |
-               (1u << PIPE_BLENDFACTOR_INV_SRC_COLOR) |
-               (1u << PIPE_BLENDFACTOR_INV_SRC_ALPHA) |
-               (1u << PIPE_BLENDFACTOR_INV_CONST_COLOR) |
-               (1u << PIPE_BLENDFACTOR_INV_CONST_ALPHA) |
-               (1u << PIPE_BLENDFACTOR_INV_SRC1_COLOR) |
-               (1u << PIPE_BLENDFACTOR_INV_SRC1_ALPHA);
-
-       if (dst == PIPE_BLENDFACTOR_ONE &&
-           (src_allowed & (1u << src))) {
-               /* Addition is commutative, but floating point addition isn't
-                * associative: subtle changes can be introduced via different
-                * rounding.
-                *
-                * Out-of-order is also non-deterministic, which means that
-                * this breaks OpenGL invariance requirements. So only enable
-                * out-of-order additive blending if explicitly allowed by a
-                * setting.
-                */
-               if (func == PIPE_BLEND_MAX || func == PIPE_BLEND_MIN ||
-                   (func == PIPE_BLEND_ADD && sscreen->commutative_blend_add))
-                       blend->commutative_4bit |= chanmask;
-       }
+   /* Src factor is allowed when it does not depend on Dst */
+   static const uint32_t src_allowed =
+      (1u << PIPE_BLENDFACTOR_ONE) | (1u << PIPE_BLENDFACTOR_SRC_COLOR) |
+      (1u << PIPE_BLENDFACTOR_SRC_ALPHA) | (1u << PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) |
+      (1u << PIPE_BLENDFACTOR_CONST_COLOR) | (1u << PIPE_BLENDFACTOR_CONST_ALPHA) |
+      (1u << PIPE_BLENDFACTOR_SRC1_COLOR) | (1u << PIPE_BLENDFACTOR_SRC1_ALPHA) |
+      (1u << PIPE_BLENDFACTOR_ZERO) | (1u << PIPE_BLENDFACTOR_INV_SRC_COLOR) |
+      (1u << PIPE_BLENDFACTOR_INV_SRC_ALPHA) | (1u << PIPE_BLENDFACTOR_INV_CONST_COLOR) |
+      (1u << PIPE_BLENDFACTOR_INV_CONST_ALPHA) | (1u << PIPE_BLENDFACTOR_INV_SRC1_COLOR) |
+      (1u << PIPE_BLENDFACTOR_INV_SRC1_ALPHA);
+
+   if (dst == PIPE_BLENDFACTOR_ONE && (src_allowed & (1u << src))) {
+      /* Addition is commutative, but floating point addition isn't
+       * associative: subtle changes can be introduced via different
+       * rounding.
+       *
+       * Out-of-order is also non-deterministic, which means that
+       * this breaks OpenGL invariance requirements. So only enable
+       * out-of-order additive blending if explicitly allowed by a
+       * setting.
+       */
+      if (func == PIPE_BLEND_MAX || func == PIPE_BLEND_MIN ||
+          (func == PIPE_BLEND_ADD && sscreen->commutative_blend_add))
+         blend->commutative_4bit |= chanmask;
+   }
 }
 
 /**
  * Get rid of DST in the blend factors by commuting the operands:
  *    func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
  */
-static void si_blend_remove_dst(unsigned *func, unsigned *src_factor,
-                               unsigned *dst_factor, unsigned expected_dst,
-                               unsigned replacement_src)
+static void si_blend_remove_dst(unsigned *func, unsigned *src_factor, unsigned *dst_factor,
+                                unsigned expected_dst, unsigned replacement_src)
 {
-       if (*src_factor == expected_dst &&
-           *dst_factor == PIPE_BLENDFACTOR_ZERO) {
-               *src_factor = PIPE_BLENDFACTOR_ZERO;
-               *dst_factor = replacement_src;
-
-               /* Commuting the operands requires reversing subtractions. */
-               if (*func == PIPE_BLEND_SUBTRACT)
-                       *func = PIPE_BLEND_REVERSE_SUBTRACT;
-               else if (*func == PIPE_BLEND_REVERSE_SUBTRACT)
-                       *func = PIPE_BLEND_SUBTRACT;
-       }
+   if (*src_factor == expected_dst && *dst_factor == PIPE_BLENDFACTOR_ZERO) {
+      *src_factor = PIPE_BLENDFACTOR_ZERO;
+      *dst_factor = replacement_src;
+
+      /* Commuting the operands requires reversing subtractions. */
+      if (*func == PIPE_BLEND_SUBTRACT)
+         *func = PIPE_BLEND_REVERSE_SUBTRACT;
+      else if (*func == PIPE_BLEND_REVERSE_SUBTRACT)
+         *func = PIPE_BLEND_SUBTRACT;
+   }
 }
 
 static bool si_blend_factor_uses_dst(unsigned factor)
 {
-       return factor == PIPE_BLENDFACTOR_DST_COLOR ||
-               factor == PIPE_BLENDFACTOR_DST_ALPHA ||
-               factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
-               factor == PIPE_BLENDFACTOR_INV_DST_ALPHA ||
-               factor == PIPE_BLENDFACTOR_INV_DST_COLOR;
+   return factor == PIPE_BLENDFACTOR_DST_COLOR || factor == PIPE_BLENDFACTOR_DST_ALPHA ||
+          factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+          factor == PIPE_BLENDFACTOR_INV_DST_ALPHA || factor == PIPE_BLENDFACTOR_INV_DST_COLOR;
 }
 
 static void *si_create_blend_state_mode(struct pipe_context *ctx,
-                                       const struct pipe_blend_state *state,
-                                       unsigned mode)
+                                        const struct pipe_blend_state *state, unsigned mode)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
-       struct si_state_blend *blend = CALLOC_STRUCT(si_state_blend);
-       struct si_pm4_state *pm4 = &blend->pm4;
-       uint32_t sx_mrt_blend_opt[8] = {0};
-       uint32_t color_control = 0;
-       bool logicop_enable = state->logicop_enable &&
-                             state->logicop_func != PIPE_LOGICOP_COPY;
-
-       if (!blend)
-               return NULL;
-
-       blend->alpha_to_coverage = state->alpha_to_coverage;
-       blend->alpha_to_one = state->alpha_to_one;
-       blend->dual_src_blend = util_blend_state_is_dual(state, 0);
-       blend->logicop_enable = logicop_enable;
-
-       if (logicop_enable) {
-               color_control |= S_028808_ROP3(state->logicop_func | (state->logicop_func << 4));
-       } else {
-               color_control |= S_028808_ROP3(0xcc);
-       }
-
-       si_pm4_set_reg(pm4, R_028B70_DB_ALPHA_TO_MASK,
-                      S_028B70_ALPHA_TO_MASK_ENABLE(state->alpha_to_coverage) |
-                      S_028B70_ALPHA_TO_MASK_OFFSET0(3) |
-                      S_028B70_ALPHA_TO_MASK_OFFSET1(1) |
-                      S_028B70_ALPHA_TO_MASK_OFFSET2(0) |
-                      S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
-                      S_028B70_OFFSET_ROUND(1));
-
-       if (state->alpha_to_coverage)
-               blend->need_src_alpha_4bit |= 0xf;
-
-       blend->cb_target_mask = 0;
-       blend->cb_target_enabled_4bit = 0;
-
-       for (int i = 0; i < 8; i++) {
-               /* state->rt entries > 0 only written if independent blending */
-               const int j = state->independent_blend_enable ? i : 0;
-
-               unsigned eqRGB = state->rt[j].rgb_func;
-               unsigned srcRGB = state->rt[j].rgb_src_factor;
-               unsigned dstRGB = state->rt[j].rgb_dst_factor;
-               unsigned eqA = state->rt[j].alpha_func;
-               unsigned srcA = state->rt[j].alpha_src_factor;
-               unsigned dstA = state->rt[j].alpha_dst_factor;
-
-               unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt;
-               unsigned blend_cntl = 0;
-
-               sx_mrt_blend_opt[i] =
-                       S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
-                       S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
-
-               /* Only set dual source blending for MRT0 to avoid a hang. */
-               if (i >= 1 && blend->dual_src_blend) {
-                       /* Vulkan does this for dual source blending. */
-                       if (i == 1)
-                               blend_cntl |= S_028780_ENABLE(1);
-
-                       si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
-                       continue;
-               }
-
-               /* Only addition and subtraction equations are supported with
-                * dual source blending.
-                */
-               if (blend->dual_src_blend &&
-                   (eqRGB == PIPE_BLEND_MIN || eqRGB == PIPE_BLEND_MAX ||
-                    eqA == PIPE_BLEND_MIN || eqA == PIPE_BLEND_MAX)) {
-                       assert(!"Unsupported equation for dual source blending");
-                       si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
-                       continue;
-               }
-
-               /* cb_render_state will disable unused ones */
-               blend->cb_target_mask |= (unsigned)state->rt[j].colormask << (4 * i);
-               if (state->rt[j].colormask)
-                       blend->cb_target_enabled_4bit |= 0xf << (4 * i);
-
-               if (!state->rt[j].colormask || !state->rt[j].blend_enable) {
-                       si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
-                       continue;
-               }
-
-               si_blend_check_commutativity(sctx->screen, blend,
-                                            eqRGB, srcRGB, dstRGB, 0x7 << (4 * i));
-               si_blend_check_commutativity(sctx->screen, blend,
-                                            eqA, srcA, dstA, 0x8 << (4 * i));
-
-               /* Blending optimizations for RB+.
-                * These transformations don't change the behavior.
-                *
-                * First, get rid of DST in the blend factors:
-                *    func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
-                */
-               si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB,
-                                   PIPE_BLENDFACTOR_DST_COLOR,
-                                   PIPE_BLENDFACTOR_SRC_COLOR);
-               si_blend_remove_dst(&eqA, &srcA, &dstA,
-                                   PIPE_BLENDFACTOR_DST_COLOR,
-                                   PIPE_BLENDFACTOR_SRC_COLOR);
-               si_blend_remove_dst(&eqA, &srcA, &dstA,
-                                   PIPE_BLENDFACTOR_DST_ALPHA,
-                                   PIPE_BLENDFACTOR_SRC_ALPHA);
-
-               /* Look up the ideal settings from tables. */
-               srcRGB_opt = si_translate_blend_opt_factor(srcRGB, false);
-               dstRGB_opt = si_translate_blend_opt_factor(dstRGB, false);
-               srcA_opt = si_translate_blend_opt_factor(srcA, true);
-               dstA_opt = si_translate_blend_opt_factor(dstA, true);
-
-               /* Handle interdependencies. */
-               if (si_blend_factor_uses_dst(srcRGB))
-                       dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
-               if (si_blend_factor_uses_dst(srcA))
-                       dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
-
-               if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE &&
-                   (dstRGB == PIPE_BLENDFACTOR_ZERO ||
-                    dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
-                    dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE))
-                       dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
-
-               /* Set the final value. */
-               sx_mrt_blend_opt[i] =
-                       S_028760_COLOR_SRC_OPT(srcRGB_opt) |
-                       S_028760_COLOR_DST_OPT(dstRGB_opt) |
-                       S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(eqRGB)) |
-                       S_028760_ALPHA_SRC_OPT(srcA_opt) |
-                       S_028760_ALPHA_DST_OPT(dstA_opt) |
-                       S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA));
-
-               /* Set blend state. */
-               blend_cntl |= S_028780_ENABLE(1);
-               blend_cntl |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB));
-               blend_cntl |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB));
-               blend_cntl |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(dstRGB));
-
-               if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
-                       blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1);
-                       blend_cntl |= S_028780_ALPHA_COMB_FCN(si_translate_blend_function(eqA));
-                       blend_cntl |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(srcA));
-                       blend_cntl |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA));
-               }
-               si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
-
-               blend->blend_enable_4bit |= 0xfu << (i * 4);
-
-               if (sctx->chip_class >= GFX8 && sctx->family <= CHIP_NAVI14)
-                       blend->dcc_msaa_corruption_4bit |= 0xfu << (i * 4);
-
-               /* This is only important for formats without alpha. */
-               if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
-                   dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
-                   srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
-                   dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
-                   srcRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
-                   dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA)
-                       blend->need_src_alpha_4bit |= 0xfu << (i * 4);
-       }
-
-       if (sctx->chip_class >= GFX8 && sctx->family <= CHIP_NAVI14 && logicop_enable)
-               blend->dcc_msaa_corruption_4bit |= blend->cb_target_enabled_4bit;
-
-       if (blend->cb_target_mask) {
-               color_control |= S_028808_MODE(mode);
-       } else {
-               color_control |= S_028808_MODE(V_028808_CB_DISABLE);
-       }
-
-       if (sctx->screen->info.rbplus_allowed) {
-               /* Disable RB+ blend optimizations for dual source blending.
-                * Vulkan does this.
-                */
-               if (blend->dual_src_blend) {
-                       for (int i = 0; i < 8; i++) {
-                               sx_mrt_blend_opt[i] =
-                                       S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) |
-                                       S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE);
-                       }
-               }
-
-               for (int i = 0; i < 8; i++)
-                       si_pm4_set_reg(pm4, R_028760_SX_MRT0_BLEND_OPT + i * 4,
-                                      sx_mrt_blend_opt[i]);
-
-               /* RB+ doesn't work with dual source blending, logic op, and RESOLVE. */
-               if (blend->dual_src_blend || logicop_enable ||
-                   mode == V_028808_CB_RESOLVE)
-                       color_control |= S_028808_DISABLE_DUAL_QUAD(1);
-       }
-
-       si_pm4_set_reg(pm4, R_028808_CB_COLOR_CONTROL, color_control);
-       return blend;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_state_blend *blend = CALLOC_STRUCT(si_state_blend);
+   struct si_pm4_state *pm4 = &blend->pm4;
+   uint32_t sx_mrt_blend_opt[8] = {0};
+   uint32_t color_control = 0;
+   bool logicop_enable = state->logicop_enable && state->logicop_func != PIPE_LOGICOP_COPY;
+
+   if (!blend)
+      return NULL;
+
+   blend->alpha_to_coverage = state->alpha_to_coverage;
+   blend->alpha_to_one = state->alpha_to_one;
+   blend->dual_src_blend = util_blend_state_is_dual(state, 0);
+   blend->logicop_enable = logicop_enable;
+
+   if (logicop_enable) {
+      color_control |= S_028808_ROP3(state->logicop_func | (state->logicop_func << 4));
+   } else {
+      color_control |= S_028808_ROP3(0xcc);
+   }
+
+   si_pm4_set_reg(pm4, R_028B70_DB_ALPHA_TO_MASK,
+                  S_028B70_ALPHA_TO_MASK_ENABLE(state->alpha_to_coverage) |
+                     S_028B70_ALPHA_TO_MASK_OFFSET0(3) | S_028B70_ALPHA_TO_MASK_OFFSET1(1) |
+                     S_028B70_ALPHA_TO_MASK_OFFSET2(0) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
+                     S_028B70_OFFSET_ROUND(1));
+
+   if (state->alpha_to_coverage)
+      blend->need_src_alpha_4bit |= 0xf;
+
+   blend->cb_target_mask = 0;
+   blend->cb_target_enabled_4bit = 0;
+
+   for (int i = 0; i < 8; i++) {
+      /* state->rt entries > 0 only written if independent blending */
+      const int j = state->independent_blend_enable ? i : 0;
+
+      unsigned eqRGB = state->rt[j].rgb_func;
+      unsigned srcRGB = state->rt[j].rgb_src_factor;
+      unsigned dstRGB = state->rt[j].rgb_dst_factor;
+      unsigned eqA = state->rt[j].alpha_func;
+      unsigned srcA = state->rt[j].alpha_src_factor;
+      unsigned dstA = state->rt[j].alpha_dst_factor;
+
+      unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt;
+      unsigned blend_cntl = 0;
+
+      sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
+                            S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
+
+      /* Only set dual source blending for MRT0 to avoid a hang. */
+      if (i >= 1 && blend->dual_src_blend) {
+         /* Vulkan does this for dual source blending. */
+         if (i == 1)
+            blend_cntl |= S_028780_ENABLE(1);
+
+         si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
+         continue;
+      }
+
+      /* Only addition and subtraction equations are supported with
+       * dual source blending.
+       */
+      if (blend->dual_src_blend && (eqRGB == PIPE_BLEND_MIN || eqRGB == PIPE_BLEND_MAX ||
+                                    eqA == PIPE_BLEND_MIN || eqA == PIPE_BLEND_MAX)) {
+         assert(!"Unsupported equation for dual source blending");
+         si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
+         continue;
+      }
+
+      /* cb_render_state will disable unused ones */
+      blend->cb_target_mask |= (unsigned)state->rt[j].colormask << (4 * i);
+      if (state->rt[j].colormask)
+         blend->cb_target_enabled_4bit |= 0xf << (4 * i);
+
+      if (!state->rt[j].colormask || !state->rt[j].blend_enable) {
+         si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
+         continue;
+      }
+
+      si_blend_check_commutativity(sctx->screen, blend, eqRGB, srcRGB, dstRGB, 0x7 << (4 * i));
+      si_blend_check_commutativity(sctx->screen, blend, eqA, srcA, dstA, 0x8 << (4 * i));
+
+      /* Blending optimizations for RB+.
+       * These transformations don't change the behavior.
+       *
+       * First, get rid of DST in the blend factors:
+       *    func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
+       */
+      si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB, PIPE_BLENDFACTOR_DST_COLOR,
+                          PIPE_BLENDFACTOR_SRC_COLOR);
+      si_blend_remove_dst(&eqA, &srcA, &dstA, PIPE_BLENDFACTOR_DST_COLOR,
+                          PIPE_BLENDFACTOR_SRC_COLOR);
+      si_blend_remove_dst(&eqA, &srcA, &dstA, PIPE_BLENDFACTOR_DST_ALPHA,
+                          PIPE_BLENDFACTOR_SRC_ALPHA);
+
+      /* Look up the ideal settings from tables. */
+      srcRGB_opt = si_translate_blend_opt_factor(srcRGB, false);
+      dstRGB_opt = si_translate_blend_opt_factor(dstRGB, false);
+      srcA_opt = si_translate_blend_opt_factor(srcA, true);
+      dstA_opt = si_translate_blend_opt_factor(dstA, true);
+
+      /* Handle interdependencies. */
+      if (si_blend_factor_uses_dst(srcRGB))
+         dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
+      if (si_blend_factor_uses_dst(srcA))
+         dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
+
+      if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE &&
+          (dstRGB == PIPE_BLENDFACTOR_ZERO || dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+           dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE))
+         dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
+
+      /* Set the final value. */
+      sx_mrt_blend_opt[i] = S_028760_COLOR_SRC_OPT(srcRGB_opt) |
+                            S_028760_COLOR_DST_OPT(dstRGB_opt) |
+                            S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(eqRGB)) |
+                            S_028760_ALPHA_SRC_OPT(srcA_opt) | S_028760_ALPHA_DST_OPT(dstA_opt) |
+                            S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA));
+
+      /* Set blend state. */
+      blend_cntl |= S_028780_ENABLE(1);
+      blend_cntl |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB));
+      blend_cntl |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB));
+      blend_cntl |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(dstRGB));
+
+      if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
+         blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1);
+         blend_cntl |= S_028780_ALPHA_COMB_FCN(si_translate_blend_function(eqA));
+         blend_cntl |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(srcA));
+         blend_cntl |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA));
+      }
+      si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
+
+      blend->blend_enable_4bit |= 0xfu << (i * 4);
+
+      if (sctx->chip_class >= GFX8 && sctx->family <= CHIP_NAVI14)
+         blend->dcc_msaa_corruption_4bit |= 0xfu << (i * 4);
+
+      /* This is only important for formats without alpha. */
+      if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA || dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+          srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+          dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+          srcRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA || dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA)
+         blend->need_src_alpha_4bit |= 0xfu << (i * 4);
+   }
+
+   if (sctx->chip_class >= GFX8 && sctx->family <= CHIP_NAVI14 && logicop_enable)
+      blend->dcc_msaa_corruption_4bit |= blend->cb_target_enabled_4bit;
+
+   if (blend->cb_target_mask) {
+      color_control |= S_028808_MODE(mode);
+   } else {
+      color_control |= S_028808_MODE(V_028808_CB_DISABLE);
+   }
+
+   if (sctx->screen->info.rbplus_allowed) {
+      /* Disable RB+ blend optimizations for dual source blending.
+       * Vulkan does this.
+       */
+      if (blend->dual_src_blend) {
+         for (int i = 0; i < 8; i++) {
+            sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) |
+                                  S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE);
+         }
+      }
+
+      for (int i = 0; i < 8; i++)
+         si_pm4_set_reg(pm4, R_028760_SX_MRT0_BLEND_OPT + i * 4, sx_mrt_blend_opt[i]);
+
+      /* RB+ doesn't work with dual source blending, logic op, and RESOLVE. */
+      if (blend->dual_src_blend || logicop_enable || mode == V_028808_CB_RESOLVE)
+         color_control |= S_028808_DISABLE_DUAL_QUAD(1);
+   }
+
+   si_pm4_set_reg(pm4, R_028808_CB_COLOR_CONTROL, color_control);
+   return blend;
 }
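
A short note on the *_4bit masks built in si_create_blend_state_mode above (layout inferred from the code; values illustrative):

   /* Per-RT state is tracked 4 bits per render target, one bit per RGBA
    * channel, so RT i uses bits [i*4+3 : i*4].  For example, colormask 0xf on
    * RT2 contributes 0xf00 to cb_target_mask, "0x7 << (4 * i)" addresses only
    * the RGB bits of RT i, and "0x8 << (4 * i)" addresses its alpha bit.
    */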
 
-static void *si_create_blend_state(struct pipe_context *ctx,
-                                  const struct pipe_blend_state *state)
+static void *si_create_blend_state(struct pipe_context *ctx, const struct pipe_blend_state *state)
 {
-       return si_create_blend_state_mode(ctx, state, V_028808_CB_NORMAL);
+   return si_create_blend_state_mode(ctx, state, V_028808_CB_NORMAL);
 }
 
 static void si_bind_blend_state(struct pipe_context *ctx, void *state)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_state_blend *old_blend = sctx->queued.named.blend;
-       struct si_state_blend *blend = (struct si_state_blend *)state;
-
-       if (!blend)
-               blend = (struct si_state_blend *)sctx->noop_blend;
-
-       si_pm4_bind_state(sctx, blend, blend);
-
-       if (old_blend->cb_target_mask != blend->cb_target_mask ||
-           old_blend->dual_src_blend != blend->dual_src_blend ||
-           (old_blend->dcc_msaa_corruption_4bit != blend->dcc_msaa_corruption_4bit &&
-            sctx->framebuffer.nr_samples >= 2 &&
-            sctx->screen->dcc_msaa_allowed))
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
-
-       if (old_blend->cb_target_mask != blend->cb_target_mask ||
-           old_blend->alpha_to_coverage != blend->alpha_to_coverage ||
-           old_blend->alpha_to_one != blend->alpha_to_one ||
-           old_blend->dual_src_blend != blend->dual_src_blend ||
-           old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
-           old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit)
-               sctx->do_update_shaders = true;
-
-       if (sctx->screen->dpbb_allowed &&
-           (old_blend->alpha_to_coverage != blend->alpha_to_coverage ||
-            old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
-            old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit))
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
-
-       if (sctx->screen->has_out_of_order_rast &&
-           ((old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
-             old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit ||
-             old_blend->commutative_4bit != blend->commutative_4bit ||
-             old_blend->logicop_enable != blend->logicop_enable)))
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_state_blend *old_blend = sctx->queued.named.blend;
+   struct si_state_blend *blend = (struct si_state_blend *)state;
+
+   if (!blend)
+      blend = (struct si_state_blend *)sctx->noop_blend;
+
+   si_pm4_bind_state(sctx, blend, blend);
+
+   if (old_blend->cb_target_mask != blend->cb_target_mask ||
+       old_blend->dual_src_blend != blend->dual_src_blend ||
+       (old_blend->dcc_msaa_corruption_4bit != blend->dcc_msaa_corruption_4bit &&
+        sctx->framebuffer.nr_samples >= 2 && sctx->screen->dcc_msaa_allowed))
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
+
+   if (old_blend->cb_target_mask != blend->cb_target_mask ||
+       old_blend->alpha_to_coverage != blend->alpha_to_coverage ||
+       old_blend->alpha_to_one != blend->alpha_to_one ||
+       old_blend->dual_src_blend != blend->dual_src_blend ||
+       old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
+       old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit)
+      sctx->do_update_shaders = true;
+
+   if (sctx->screen->dpbb_allowed &&
+       (old_blend->alpha_to_coverage != blend->alpha_to_coverage ||
+        old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
+        old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit))
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
+
+   if (sctx->screen->has_out_of_order_rast &&
+       ((old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
+         old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit ||
+         old_blend->commutative_4bit != blend->commutative_4bit ||
+         old_blend->logicop_enable != blend->logicop_enable)))
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
 }
 
 static void si_delete_blend_state(struct pipe_context *ctx, void *state)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
+   struct si_context *sctx = (struct si_context *)ctx;
 
-       if (sctx->queued.named.blend == state)
-               si_bind_blend_state(ctx, sctx->noop_blend);
+   if (sctx->queued.named.blend == state)
+      si_bind_blend_state(ctx, sctx->noop_blend);
 
-       si_pm4_delete_state(sctx, blend, (struct si_state_blend *)state);
+   si_pm4_delete_state(sctx, blend, (struct si_state_blend *)state);
 }
 
-static void si_set_blend_color(struct pipe_context *ctx,
-                              const struct pipe_blend_color *state)
+static void si_set_blend_color(struct pipe_context *ctx, const struct pipe_blend_color *state)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       static const struct pipe_blend_color zeros;
+   struct si_context *sctx = (struct si_context *)ctx;
+   static const struct pipe_blend_color zeros;
 
-       sctx->blend_color.state = *state;
-       sctx->blend_color.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0;
-       si_mark_atom_dirty(sctx, &sctx->atoms.s.blend_color);
+   sctx->blend_color.state = *state;
+   sctx->blend_color.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.blend_color);
 }
 
 static void si_emit_blend_color(struct si_context *sctx)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
-       radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4);
-       radeon_emit_array(cs, (uint32_t*)sctx->blend_color.state.color, 4);
+   radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4);
+   radeon_emit_array(cs, (uint32_t *)sctx->blend_color.state.color, 4);
 }
 
 /*
  * Clipping
  */
 
-static void si_set_clip_state(struct pipe_context *ctx,
-                             const struct pipe_clip_state *state)
+static void si_set_clip_state(struct pipe_context *ctx, const struct pipe_clip_state *state)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct pipe_constant_buffer cb;
-       static const struct pipe_clip_state zeros;
-
-       if (memcmp(&sctx->clip_state.state, state, sizeof(*state)) == 0)
-               return;
-
-       sctx->clip_state.state = *state;
-       sctx->clip_state.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0;
-       si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_state);
-
-       cb.buffer = NULL;
-       cb.user_buffer = state->ucp;
-       cb.buffer_offset = 0;
-       cb.buffer_size = 4*4*8;
-       si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES, &cb);
-       pipe_resource_reference(&cb.buffer, NULL);
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct pipe_constant_buffer cb;
+   static const struct pipe_clip_state zeros;
+
+   if (memcmp(&sctx->clip_state.state, state, sizeof(*state)) == 0)
+      return;
+
+   sctx->clip_state.state = *state;
+   sctx->clip_state.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_state);
+
+   cb.buffer = NULL;
+   cb.user_buffer = state->ucp;
+   cb.buffer_offset = 0;
+   cb.buffer_size = 4 * 4 * 8;
+   si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES, &cb);
+   pipe_resource_reference(&cb.buffer, NULL);
 }
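
For clarity, the constant-buffer size used above works out as follows (simple arithmetic, nothing assumed beyond the code):

   /* cb.buffer_size = 4 * 4 * 8 = 128 bytes: eight user clip planes, each a
    * vec4 of 4-byte floats, uploaded as SI_VS_CONST_CLIP_PLANES. */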
 
 static void si_emit_clip_state(struct si_context *sctx)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
-       radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6*4);
-       radeon_emit_array(cs, (uint32_t*)sctx->clip_state.state.ucp, 6*4);
+   radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6 * 4);
+   radeon_emit_array(cs, (uint32_t *)sctx->clip_state.state.ucp, 6 * 4);
 }
 
 static void si_emit_clip_regs(struct si_context *sctx)
 {
-       struct si_shader *vs = si_get_vs_state(sctx);
-       struct si_shader_selector *vs_sel = vs->selector;
-       struct si_shader_info *info = &vs_sel->info;
-       struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
-       unsigned window_space =
-          info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
-       unsigned clipdist_mask = vs_sel->clipdist_mask;
-       unsigned ucp_mask = clipdist_mask ? 0 : rs->clip_plane_enable & SIX_BITS;
-       unsigned culldist_mask = vs_sel->culldist_mask;
-       unsigned total_mask;
-
-       if (vs->key.opt.clip_disable) {
-               assert(!info->culldist_writemask);
-               clipdist_mask = 0;
-               culldist_mask = 0;
-       }
-       total_mask = clipdist_mask | culldist_mask;
-
-       /* Clip distances on points have no effect, so they need to be implemented
-        * as cull distances. This applies for the clipvertex case as well.
-        *
-        * Setting this for primitives other than points should have no adverse
-        * effects.
-        */
-       clipdist_mask &= rs->clip_plane_enable;
-       culldist_mask |= clipdist_mask;
-
-       unsigned initial_cdw = sctx->gfx_cs->current.cdw;
-       unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) |
-                             S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) |
-                             clipdist_mask | (culldist_mask << 8);
-
-       if (sctx->chip_class >= GFX10) {
-               radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
-                                              SI_TRACKED_PA_CL_VS_OUT_CNTL__CL,
-                                              pa_cl_cntl,
-                                              ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
-       } else {
-               radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
-                                          SI_TRACKED_PA_CL_VS_OUT_CNTL__CL,
-                                          vs_sel->pa_cl_vs_out_cntl | pa_cl_cntl);
-       }
-       radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL,
-               SI_TRACKED_PA_CL_CLIP_CNTL,
-               rs->pa_cl_clip_cntl |
-               ucp_mask |
-               S_028810_CLIP_DISABLE(window_space));
-
-       if (initial_cdw != sctx->gfx_cs->current.cdw)
-               sctx->context_roll = true;
+   struct si_shader *vs = si_get_vs_state(sctx);
+   struct si_shader_selector *vs_sel = vs->selector;
+   struct si_shader_info *info = &vs_sel->info;
+   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+   unsigned window_space = info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
+   unsigned clipdist_mask = vs_sel->clipdist_mask;
+   unsigned ucp_mask = clipdist_mask ? 0 : rs->clip_plane_enable & SIX_BITS;
+   unsigned culldist_mask = vs_sel->culldist_mask;
+   unsigned total_mask;
+
+   if (vs->key.opt.clip_disable) {
+      assert(!info->culldist_writemask);
+      clipdist_mask = 0;
+      culldist_mask = 0;
+   }
+   total_mask = clipdist_mask | culldist_mask;
+
+   /* Clip distances on points have no effect, so need to be implemented
+    * as cull distances. This applies for the clipvertex case as well.
+    *
+    * Setting this for primitives other than points should have no adverse
+    * effects.
+    */
+   clipdist_mask &= rs->clip_plane_enable;
+   culldist_mask |= clipdist_mask;
+
+   unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+   unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) |
+                         S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) | clipdist_mask |
+                         (culldist_mask << 8);
+
+   if (sctx->chip_class >= GFX10) {
+      radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
+                                     SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, pa_cl_cntl,
+                                     ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
+   } else {
+      radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL__CL,
+                                 vs_sel->pa_cl_vs_out_cntl | pa_cl_cntl);
+   }
+   radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL,
+                              rs->pa_cl_clip_cntl | ucp_mask | S_028810_CLIP_DISABLE(window_space));
+
+   if (initial_cdw != sctx->gfx_cs->current.cdw)
+      sctx->context_roll = true;
 }
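
The mask handling above (clip distances folded into cull distances, clip distances gated by the enabled clip planes, then both packed into one register value with cull bits shifted up by 8) can be illustrated with plain integer arithmetic. A minimal sketch with made-up shader/rasterizer values — only the bit manipulation mirrors the code above, the S_02881C_* field macros are left out:

#include <stdio.h>

int main(void)
{
   /* Hypothetical state: the vertex shader writes clip distances 0-3 and
    * cull distances 4-5, and the rasterizer enables clip planes 0-1. */
   unsigned clipdist_mask = 0x0f;
   unsigned culldist_mask = 0x30;
   unsigned clip_plane_enable = 0x03;

   unsigned total_mask = clipdist_mask | culldist_mask;   /* 0x3f */

   /* Same two statements as in si_emit_clip_regs(): clip distances on points
    * have no effect, so they are also treated as cull distances. */
   clipdist_mask &= clip_plane_enable;                     /* 0x03 */
   culldist_mask |= clipdist_mask;                         /* 0x33 */

   /* Clip mask in bits 0-7, cull mask in bits 8-15 of the packed value. */
   unsigned packed = clipdist_mask | (culldist_mask << 8); /* 0x3303 */

   printf("total_mask=0x%x packed=0x%x\n", total_mask, packed);
   return 0;
}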
 
 /*
@@ -834,28 +777,28 @@ static void si_emit_clip_regs(struct si_context *sctx)
  */
 static void si_update_poly_offset_state(struct si_context *sctx)
 {
-       struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
-
-       if (!rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) {
-               si_pm4_bind_state(sctx, poly_offset, NULL);
-               return;
-       }
-
-       /* Use the user format, not db_render_format, so that the polygon
-        * offset behaves as expected by applications.
-        */
-       switch (sctx->framebuffer.state.zsbuf->texture->format) {
-       case PIPE_FORMAT_Z16_UNORM:
-               si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[0]);
-               break;
-       default: /* 24-bit */
-               si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[1]);
-               break;
-       case PIPE_FORMAT_Z32_FLOAT:
-       case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
-               si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[2]);
-               break;
-       }
+   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+
+   if (!rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) {
+      si_pm4_bind_state(sctx, poly_offset, NULL);
+      return;
+   }
+
+   /* Use the user format, not db_render_format, so that the polygon
+    * offset behaves as expected by applications.
+    */
+   switch (sctx->framebuffer.state.zsbuf->texture->format) {
+   case PIPE_FORMAT_Z16_UNORM:
+      si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[0]);
+      break;
+   default: /* 24-bit */
+      si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[1]);
+      break;
+   case PIPE_FORMAT_Z32_FLOAT:
+   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+      si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[2]);
+      break;
+   }
 }
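
The three precalculated states selected here are built by si_create_rs_state() further down; what differs between them is the index chosen per depth-buffer format and the factor applied to offset_units (4x for 16-bit, 2x for 24-bit, 1x for 32-bit float). A rough standalone sketch of that mapping, using a stand-in enum rather than the gallium formats:

#include <stdio.h>

/* Stand-in for the depth formats distinguished by the switch above. */
enum zformat { Z16_UNORM, Z24_UNORM, Z32_FLOAT };

/* Mirrors si_update_poly_offset_state(): index 0 for 16-bit, 1 for 24-bit
 * (the default case), 2 for 32-bit float depth buffers. */
static int poly_offset_state_index(enum zformat f)
{
   switch (f) {
   case Z16_UNORM:
      return 0;
   case Z32_FLOAT:
      return 2;
   default:
      return 1;
   }
}

int main(void)
{
   /* offset_units scale factors used when precalculating the three states. */
   static const float unit_scale[3] = {4.0f, 2.0f, 1.0f};
   float offset_units = 1.0f; /* hypothetical application value */

   int idx = poly_offset_state_index(Z16_UNORM);
   printf("state %d, scaled units %.1f\n", idx, offset_units * unit_scale[idx]);
   return 0;
}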
 
 /*
@@ -864,245 +807,228 @@ static void si_update_poly_offset_state(struct si_context *sctx)
 
 static uint32_t si_translate_fill(uint32_t func)
 {
-       switch(func) {
-       case PIPE_POLYGON_MODE_FILL:
-               return V_028814_X_DRAW_TRIANGLES;
-       case PIPE_POLYGON_MODE_LINE:
-               return V_028814_X_DRAW_LINES;
-       case PIPE_POLYGON_MODE_POINT:
-               return V_028814_X_DRAW_POINTS;
-       default:
-               assert(0);
-               return V_028814_X_DRAW_POINTS;
-       }
+   switch (func) {
+   case PIPE_POLYGON_MODE_FILL:
+      return V_028814_X_DRAW_TRIANGLES;
+   case PIPE_POLYGON_MODE_LINE:
+      return V_028814_X_DRAW_LINES;
+   case PIPE_POLYGON_MODE_POINT:
+      return V_028814_X_DRAW_POINTS;
+   default:
+      assert(0);
+      return V_028814_X_DRAW_POINTS;
+   }
 }
 
-static void *si_create_rs_state(struct pipe_context *ctx,
-                               const struct pipe_rasterizer_state *state)
+static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rasterizer_state *state)
 {
-       struct si_screen *sscreen = ((struct si_context *)ctx)->screen;
-       struct si_state_rasterizer *rs = CALLOC_STRUCT(si_state_rasterizer);
-       struct si_pm4_state *pm4 = &rs->pm4;
-       unsigned tmp, i;
-       float psize_min, psize_max;
-
-       if (!rs) {
-               return NULL;
-       }
-
-       if (!state->front_ccw) {
-               rs->cull_front = !!(state->cull_face & PIPE_FACE_FRONT);
-               rs->cull_back = !!(state->cull_face & PIPE_FACE_BACK);
-       } else {
-               rs->cull_back = !!(state->cull_face & PIPE_FACE_FRONT);
-               rs->cull_front = !!(state->cull_face & PIPE_FACE_BACK);
-       }
-       rs->depth_clamp_any = !state->depth_clip_near || !state->depth_clip_far;
-       rs->provoking_vertex_first = state->flatshade_first;
-       rs->scissor_enable = state->scissor;
-       rs->clip_halfz = state->clip_halfz;
-       rs->two_side = state->light_twoside;
-       rs->multisample_enable = state->multisample;
-       rs->force_persample_interp = state->force_persample_interp;
-       rs->clip_plane_enable = state->clip_plane_enable;
-       rs->half_pixel_center = state->half_pixel_center;
-       rs->line_stipple_enable = state->line_stipple_enable;
-       rs->poly_stipple_enable = state->poly_stipple_enable;
-       rs->line_smooth = state->line_smooth;
-       rs->line_width = state->line_width;
-       rs->poly_smooth = state->poly_smooth;
-       rs->uses_poly_offset = state->offset_point || state->offset_line ||
-                              state->offset_tri;
-       rs->clamp_fragment_color = state->clamp_fragment_color;
-       rs->clamp_vertex_color = state->clamp_vertex_color;
-       rs->flatshade = state->flatshade;
-       rs->flatshade_first = state->flatshade_first;
-       rs->sprite_coord_enable = state->sprite_coord_enable;
-       rs->rasterizer_discard = state->rasterizer_discard;
-       rs->polygon_mode_enabled = (state->fill_front != PIPE_POLYGON_MODE_FILL &&
-                                   !(state->cull_face & PIPE_FACE_FRONT)) ||
-                                  (state->fill_back != PIPE_POLYGON_MODE_FILL &&
-                                   !(state->cull_face & PIPE_FACE_BACK));
-       rs->polygon_mode_is_lines = (state->fill_front == PIPE_POLYGON_MODE_LINE &&
-                                    !(state->cull_face & PIPE_FACE_FRONT)) ||
-                                   (state->fill_back == PIPE_POLYGON_MODE_LINE &&
-                                    !(state->cull_face & PIPE_FACE_BACK));
-       rs->pa_sc_line_stipple = state->line_stipple_enable ?
-                               S_028A0C_LINE_PATTERN(state->line_stipple_pattern) |
-                               S_028A0C_REPEAT_COUNT(state->line_stipple_factor) : 0;
-       rs->pa_cl_clip_cntl =
-               S_028810_DX_CLIP_SPACE_DEF(state->clip_halfz) |
-               S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip_near) |
-               S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip_far) |
-               S_028810_DX_RASTERIZATION_KILL(state->rasterizer_discard) |
-               S_028810_DX_LINEAR_ATTR_CLIP_ENA(1);
-
-       si_pm4_set_reg(pm4, R_0286D4_SPI_INTERP_CONTROL_0,
-               S_0286D4_FLAT_SHADE_ENA(1) |
-               S_0286D4_PNT_SPRITE_ENA(state->point_quad_rasterization) |
-               S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) |
-               S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) |
-               S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) |
-               S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) |
-               S_0286D4_PNT_SPRITE_TOP_1(state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT));
-
-       /* point size 12.4 fixed point */
-       tmp = (unsigned)(state->point_size * 8.0);
-       si_pm4_set_reg(pm4, R_028A00_PA_SU_POINT_SIZE, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp));
-
-       if (state->point_size_per_vertex) {
-               psize_min = util_get_min_point_size(state);
-               psize_max = SI_MAX_POINT_SIZE;
-       } else {
-               /* Force the point size to be as if the vertex output was disabled. */
-               psize_min = state->point_size;
-               psize_max = state->point_size;
-       }
-       rs->max_point_size = psize_max;
-
-       /* Divide by two, because 0.5 = 1 pixel. */
-       si_pm4_set_reg(pm4, R_028A04_PA_SU_POINT_MINMAX,
-                       S_028A04_MIN_SIZE(si_pack_float_12p4(psize_min/2)) |
-                       S_028A04_MAX_SIZE(si_pack_float_12p4(psize_max/2)));
-
-       si_pm4_set_reg(pm4, R_028A08_PA_SU_LINE_CNTL,
-                      S_028A08_WIDTH(si_pack_float_12p4(state->line_width/2)));
-       si_pm4_set_reg(pm4, R_028A48_PA_SC_MODE_CNTL_0,
-                      S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable) |
-                      S_028A48_MSAA_ENABLE(state->multisample ||
-                                           state->poly_smooth ||
-                                           state->line_smooth) |
-                      S_028A48_VPORT_SCISSOR_ENABLE(1) |
-                      S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->info.chip_class >= GFX9));
-
-       si_pm4_set_reg(pm4, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp));
-       si_pm4_set_reg(pm4, R_028814_PA_SU_SC_MODE_CNTL,
-               S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) |
-               S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
-               S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
-               S_028814_FACE(!state->front_ccw) |
-               S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) |
-               S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) |
-               S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) |
-               S_028814_POLY_MODE(rs->polygon_mode_enabled) |
-               S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) |
-               S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back)));
-
-       if (!rs->uses_poly_offset)
-               return rs;
-
-       rs->pm4_poly_offset = CALLOC(3, sizeof(struct si_pm4_state));
-       if (!rs->pm4_poly_offset) {
-               FREE(rs);
-               return NULL;
-       }
-
-       /* Precalculate polygon offset states for 16-bit, 24-bit, and 32-bit zbuffers. */
-       for (i = 0; i < 3; i++) {
-               struct si_pm4_state *pm4 = &rs->pm4_poly_offset[i];
-               float offset_units = state->offset_units;
-               float offset_scale = state->offset_scale * 16.0f;
-               uint32_t pa_su_poly_offset_db_fmt_cntl = 0;
-
-               if (!state->offset_units_unscaled) {
-                       switch (i) {
-                       case 0: /* 16-bit zbuffer */
-                               offset_units *= 4.0f;
-                               pa_su_poly_offset_db_fmt_cntl =
-                                       S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16);
-                               break;
-                       case 1: /* 24-bit zbuffer */
-                               offset_units *= 2.0f;
-                               pa_su_poly_offset_db_fmt_cntl =
-                                       S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24);
-                               break;
-                       case 2: /* 32-bit zbuffer */
-                               offset_units *= 1.0f;
-                               pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-23) |
-                                                               S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1);
-                               break;
-                       }
-               }
-
-               si_pm4_set_reg(pm4, R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE,
-                              fui(offset_scale));
-               si_pm4_set_reg(pm4, R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET,
-                              fui(offset_units));
-               si_pm4_set_reg(pm4, R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE,
-                              fui(offset_scale));
-               si_pm4_set_reg(pm4, R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET,
-                              fui(offset_units));
-               si_pm4_set_reg(pm4, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
-                              pa_su_poly_offset_db_fmt_cntl);
-       }
-
-       return rs;
+   struct si_screen *sscreen = ((struct si_context *)ctx)->screen;
+   struct si_state_rasterizer *rs = CALLOC_STRUCT(si_state_rasterizer);
+   struct si_pm4_state *pm4 = &rs->pm4;
+   unsigned tmp, i;
+   float psize_min, psize_max;
+
+   if (!rs) {
+      return NULL;
+   }
+
+   if (!state->front_ccw) {
+      rs->cull_front = !!(state->cull_face & PIPE_FACE_FRONT);
+      rs->cull_back = !!(state->cull_face & PIPE_FACE_BACK);
+   } else {
+      rs->cull_back = !!(state->cull_face & PIPE_FACE_FRONT);
+      rs->cull_front = !!(state->cull_face & PIPE_FACE_BACK);
+   }
+   rs->depth_clamp_any = !state->depth_clip_near || !state->depth_clip_far;
+   rs->provoking_vertex_first = state->flatshade_first;
+   rs->scissor_enable = state->scissor;
+   rs->clip_halfz = state->clip_halfz;
+   rs->two_side = state->light_twoside;
+   rs->multisample_enable = state->multisample;
+   rs->force_persample_interp = state->force_persample_interp;
+   rs->clip_plane_enable = state->clip_plane_enable;
+   rs->half_pixel_center = state->half_pixel_center;
+   rs->line_stipple_enable = state->line_stipple_enable;
+   rs->poly_stipple_enable = state->poly_stipple_enable;
+   rs->line_smooth = state->line_smooth;
+   rs->line_width = state->line_width;
+   rs->poly_smooth = state->poly_smooth;
+   rs->uses_poly_offset = state->offset_point || state->offset_line || state->offset_tri;
+   rs->clamp_fragment_color = state->clamp_fragment_color;
+   rs->clamp_vertex_color = state->clamp_vertex_color;
+   rs->flatshade = state->flatshade;
+   rs->flatshade_first = state->flatshade_first;
+   rs->sprite_coord_enable = state->sprite_coord_enable;
+   rs->rasterizer_discard = state->rasterizer_discard;
+   rs->polygon_mode_enabled =
+      (state->fill_front != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_FRONT)) ||
+      (state->fill_back != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_BACK));
+   rs->polygon_mode_is_lines =
+      (state->fill_front == PIPE_POLYGON_MODE_LINE && !(state->cull_face & PIPE_FACE_FRONT)) ||
+      (state->fill_back == PIPE_POLYGON_MODE_LINE && !(state->cull_face & PIPE_FACE_BACK));
+   rs->pa_sc_line_stipple = state->line_stipple_enable
+                               ? S_028A0C_LINE_PATTERN(state->line_stipple_pattern) |
+                                    S_028A0C_REPEAT_COUNT(state->line_stipple_factor)
+                               : 0;
+   rs->pa_cl_clip_cntl = S_028810_DX_CLIP_SPACE_DEF(state->clip_halfz) |
+                         S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip_near) |
+                         S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip_far) |
+                         S_028810_DX_RASTERIZATION_KILL(state->rasterizer_discard) |
+                         S_028810_DX_LINEAR_ATTR_CLIP_ENA(1);
+
+   si_pm4_set_reg(
+      pm4, R_0286D4_SPI_INTERP_CONTROL_0,
+      S_0286D4_FLAT_SHADE_ENA(1) | S_0286D4_PNT_SPRITE_ENA(state->point_quad_rasterization) |
+         S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) |
+         S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) |
+         S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) |
+         S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) |
+         S_0286D4_PNT_SPRITE_TOP_1(state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT));
+
+   /* point size 12.4 fixed point */
+   tmp = (unsigned)(state->point_size * 8.0);
+   si_pm4_set_reg(pm4, R_028A00_PA_SU_POINT_SIZE, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp));
+
+   if (state->point_size_per_vertex) {
+      psize_min = util_get_min_point_size(state);
+      psize_max = SI_MAX_POINT_SIZE;
+   } else {
+      /* Force the point size to be as if the vertex output was disabled. */
+      psize_min = state->point_size;
+      psize_max = state->point_size;
+   }
+   rs->max_point_size = psize_max;
+
+   /* Divide by two, because 0.5 = 1 pixel. */
+   si_pm4_set_reg(pm4, R_028A04_PA_SU_POINT_MINMAX,
+                  S_028A04_MIN_SIZE(si_pack_float_12p4(psize_min / 2)) |
+                     S_028A04_MAX_SIZE(si_pack_float_12p4(psize_max / 2)));
+
+   si_pm4_set_reg(pm4, R_028A08_PA_SU_LINE_CNTL,
+                  S_028A08_WIDTH(si_pack_float_12p4(state->line_width / 2)));
+   si_pm4_set_reg(
+      pm4, R_028A48_PA_SC_MODE_CNTL_0,
+      S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable) |
+         S_028A48_MSAA_ENABLE(state->multisample || state->poly_smooth || state->line_smooth) |
+         S_028A48_VPORT_SCISSOR_ENABLE(1) |
+         S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->info.chip_class >= GFX9));
+
+   si_pm4_set_reg(pm4, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp));
+   si_pm4_set_reg(pm4, R_028814_PA_SU_SC_MODE_CNTL,
+                  S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) |
+                     S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
+                     S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
+                     S_028814_FACE(!state->front_ccw) |
+                     S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) |
+                     S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) |
+                     S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) |
+                     S_028814_POLY_MODE(rs->polygon_mode_enabled) |
+                     S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) |
+                     S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back)));
+
+   if (!rs->uses_poly_offset)
+      return rs;
+
+   rs->pm4_poly_offset = CALLOC(3, sizeof(struct si_pm4_state));
+   if (!rs->pm4_poly_offset) {
+      FREE(rs);
+      return NULL;
+   }
+
+   /* Precalculate polygon offset states for 16-bit, 24-bit, and 32-bit zbuffers. */
+   for (i = 0; i < 3; i++) {
+      struct si_pm4_state *pm4 = &rs->pm4_poly_offset[i];
+      float offset_units = state->offset_units;
+      float offset_scale = state->offset_scale * 16.0f;
+      uint32_t pa_su_poly_offset_db_fmt_cntl = 0;
+
+      if (!state->offset_units_unscaled) {
+         switch (i) {
+         case 0: /* 16-bit zbuffer */
+            offset_units *= 4.0f;
+            pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16);
+            break;
+         case 1: /* 24-bit zbuffer */
+            offset_units *= 2.0f;
+            pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24);
+            break;
+         case 2: /* 32-bit zbuffer */
+            offset_units *= 1.0f;
+            pa_su_poly_offset_db_fmt_cntl =
+               S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-23) | S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1);
+            break;
+         }
+      }
+
+      si_pm4_set_reg(pm4, R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE, fui(offset_scale));
+      si_pm4_set_reg(pm4, R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, fui(offset_units));
+      si_pm4_set_reg(pm4, R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, fui(offset_scale));
+      si_pm4_set_reg(pm4, R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, fui(offset_units));
+      si_pm4_set_reg(pm4, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, pa_su_poly_offset_db_fmt_cntl);
+   }
+
+   return rs;
 }
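
Point and line sizes above are programmed as half-sizes in 12.4 fixed point, hence the divisions by two next to the "0.5 = 1 pixel" comment. A minimal sketch of the encoding, assuming si_pack_float_12p4() is a plain clamped conversion to a 12.4 unsigned value (an illustration, not the real helper):

#include <stdint.h>
#include <stdio.h>

/* Assumed behaviour: 12 integer bits, 4 fractional bits, clamped to [0, 4096). */
static uint32_t pack_float_12p4(float x)
{
   if (x <= 0.0f)
      return 0;
   if (x >= 4096.0f)
      return 0xffff;
   return (uint32_t)(x * 16.0f);
}

int main(void)
{
   float line_width = 2.5f; /* hypothetical pipe_rasterizer_state::line_width */

   /* Divide by two, because 0.5 = 1 pixel (same convention as above):
    * 1.25 in 12.4 fixed point is 1.25 * 16 = 20 = 0x14. */
   printf("encoded half-width = 0x%x\n", pack_float_12p4(line_width / 2));
   return 0;
}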
 
 static void si_bind_rs_state(struct pipe_context *ctx, void *state)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_state_rasterizer *old_rs =
-               (struct si_state_rasterizer*)sctx->queued.named.rasterizer;
-       struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state;
-
-       if (!rs)
-               rs = (struct si_state_rasterizer *)sctx->discard_rasterizer_state;
-
-       if (old_rs->multisample_enable != rs->multisample_enable) {
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
-
-               /* Update the small primitive filter workaround if necessary. */
-               if (sctx->screen->info.has_msaa_sample_loc_bug &&
-                   sctx->framebuffer.nr_samples > 1)
-                       si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
-       }
-
-       sctx->current_vs_state &= C_VS_STATE_CLAMP_VERTEX_COLOR;
-       sctx->current_vs_state |= S_VS_STATE_CLAMP_VERTEX_COLOR(rs->clamp_vertex_color);
-
-       si_pm4_bind_state(sctx, rasterizer, rs);
-       si_update_poly_offset_state(sctx);
-
-       if (old_rs->scissor_enable != rs->scissor_enable)
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors);
-
-       if (old_rs->line_width != rs->line_width ||
-           old_rs->max_point_size != rs->max_point_size ||
-           old_rs->half_pixel_center != rs->half_pixel_center)
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband);
-
-       if (old_rs->clip_halfz != rs->clip_halfz)
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.viewports);
-
-       if (old_rs->clip_plane_enable != rs->clip_plane_enable ||
-           old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl)
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
-
-       if (old_rs->clip_plane_enable != rs->clip_plane_enable ||
-           old_rs->rasterizer_discard != rs->rasterizer_discard ||
-           old_rs->sprite_coord_enable != rs->sprite_coord_enable ||
-           old_rs->flatshade != rs->flatshade ||
-           old_rs->two_side != rs->two_side ||
-           old_rs->multisample_enable != rs->multisample_enable ||
-           old_rs->poly_stipple_enable != rs->poly_stipple_enable ||
-           old_rs->poly_smooth != rs->poly_smooth ||
-           old_rs->line_smooth != rs->line_smooth ||
-           old_rs->clamp_fragment_color != rs->clamp_fragment_color ||
-           old_rs->force_persample_interp != rs->force_persample_interp)
-               sctx->do_update_shaders = true;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_state_rasterizer *old_rs = (struct si_state_rasterizer *)sctx->queued.named.rasterizer;
+   struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state;
+
+   if (!rs)
+      rs = (struct si_state_rasterizer *)sctx->discard_rasterizer_state;
+
+   if (old_rs->multisample_enable != rs->multisample_enable) {
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+
+      /* Update the small primitive filter workaround if necessary. */
+      if (sctx->screen->info.has_msaa_sample_loc_bug && sctx->framebuffer.nr_samples > 1)
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
+   }
+
+   sctx->current_vs_state &= C_VS_STATE_CLAMP_VERTEX_COLOR;
+   sctx->current_vs_state |= S_VS_STATE_CLAMP_VERTEX_COLOR(rs->clamp_vertex_color);
+
+   si_pm4_bind_state(sctx, rasterizer, rs);
+   si_update_poly_offset_state(sctx);
+
+   if (old_rs->scissor_enable != rs->scissor_enable)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors);
+
+   if (old_rs->line_width != rs->line_width || old_rs->max_point_size != rs->max_point_size ||
+       old_rs->half_pixel_center != rs->half_pixel_center)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband);
+
+   if (old_rs->clip_halfz != rs->clip_halfz)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.viewports);
+
+   if (old_rs->clip_plane_enable != rs->clip_plane_enable ||
+       old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
+
+   if (old_rs->clip_plane_enable != rs->clip_plane_enable ||
+       old_rs->rasterizer_discard != rs->rasterizer_discard ||
+       old_rs->sprite_coord_enable != rs->sprite_coord_enable ||
+       old_rs->flatshade != rs->flatshade || old_rs->two_side != rs->two_side ||
+       old_rs->multisample_enable != rs->multisample_enable ||
+       old_rs->poly_stipple_enable != rs->poly_stipple_enable ||
+       old_rs->poly_smooth != rs->poly_smooth || old_rs->line_smooth != rs->line_smooth ||
+       old_rs->clamp_fragment_color != rs->clamp_fragment_color ||
+       old_rs->force_persample_interp != rs->force_persample_interp)
+      sctx->do_update_shaders = true;
 }
 
 static void si_delete_rs_state(struct pipe_context *ctx, void *state)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state;
 
-       if (sctx->queued.named.rasterizer == state)
-               si_bind_rs_state(ctx, sctx->discard_rasterizer_state);
+   if (sctx->queued.named.rasterizer == state)
+      si_bind_rs_state(ctx, sctx->discard_rasterizer_state);
 
-       FREE(rs->pm4_poly_offset);
-       si_pm4_delete_state(sctx, rasterizer, rs);
+   FREE(rs->pm4_poly_offset);
+   si_pm4_delete_state(sctx, rasterizer, rs);
 }
 
 /*
@@ -1110,81 +1036,75 @@ static void si_delete_rs_state(struct pipe_context *ctx, void *state)
  */
 static void si_emit_stencil_ref(struct si_context *sctx)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       struct pipe_stencil_ref *ref = &sctx->stencil_ref.state;
-       struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part;
-
-       radeon_set_context_reg_seq(cs, R_028430_DB_STENCILREFMASK, 2);
-       radeon_emit(cs, S_028430_STENCILTESTVAL(ref->ref_value[0]) |
-                       S_028430_STENCILMASK(dsa->valuemask[0]) |
-                       S_028430_STENCILWRITEMASK(dsa->writemask[0]) |
-                       S_028430_STENCILOPVAL(1));
-       radeon_emit(cs, S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) |
-                       S_028434_STENCILMASK_BF(dsa->valuemask[1]) |
-                       S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) |
-                       S_028434_STENCILOPVAL_BF(1));
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   struct pipe_stencil_ref *ref = &sctx->stencil_ref.state;
+   struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part;
+
+   radeon_set_context_reg_seq(cs, R_028430_DB_STENCILREFMASK, 2);
+   radeon_emit(cs, S_028430_STENCILTESTVAL(ref->ref_value[0]) |
+                      S_028430_STENCILMASK(dsa->valuemask[0]) |
+                      S_028430_STENCILWRITEMASK(dsa->writemask[0]) | S_028430_STENCILOPVAL(1));
+   radeon_emit(cs, S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) |
+                      S_028434_STENCILMASK_BF(dsa->valuemask[1]) |
+                      S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) |
+                      S_028434_STENCILOPVAL_BF(1));
 }
 
-static void si_set_stencil_ref(struct pipe_context *ctx,
-                              const struct pipe_stencil_ref *state)
+static void si_set_stencil_ref(struct pipe_context *ctx, const struct pipe_stencil_ref *state)
 {
-        struct si_context *sctx = (struct si_context *)ctx;
+   struct si_context *sctx = (struct si_context *)ctx;
 
-       if (memcmp(&sctx->stencil_ref.state, state, sizeof(*state)) == 0)
-               return;
+   if (memcmp(&sctx->stencil_ref.state, state, sizeof(*state)) == 0)
+      return;
 
-       sctx->stencil_ref.state = *state;
-       si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref);
+   sctx->stencil_ref.state = *state;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref);
 }
 
-
 /*
  * DSA
  */
 
 static uint32_t si_translate_stencil_op(int s_op)
 {
-       switch (s_op) {
-       case PIPE_STENCIL_OP_KEEP:
-               return V_02842C_STENCIL_KEEP;
-       case PIPE_STENCIL_OP_ZERO:
-               return V_02842C_STENCIL_ZERO;
-       case PIPE_STENCIL_OP_REPLACE:
-               return V_02842C_STENCIL_REPLACE_TEST;
-       case PIPE_STENCIL_OP_INCR:
-               return V_02842C_STENCIL_ADD_CLAMP;
-       case PIPE_STENCIL_OP_DECR:
-               return V_02842C_STENCIL_SUB_CLAMP;
-       case PIPE_STENCIL_OP_INCR_WRAP:
-               return V_02842C_STENCIL_ADD_WRAP;
-       case PIPE_STENCIL_OP_DECR_WRAP:
-               return V_02842C_STENCIL_SUB_WRAP;
-       case PIPE_STENCIL_OP_INVERT:
-               return V_02842C_STENCIL_INVERT;
-       default:
-               PRINT_ERR("Unknown stencil op %d", s_op);
-               assert(0);
-               break;
-       }
-       return 0;
+   switch (s_op) {
+   case PIPE_STENCIL_OP_KEEP:
+      return V_02842C_STENCIL_KEEP;
+   case PIPE_STENCIL_OP_ZERO:
+      return V_02842C_STENCIL_ZERO;
+   case PIPE_STENCIL_OP_REPLACE:
+      return V_02842C_STENCIL_REPLACE_TEST;
+   case PIPE_STENCIL_OP_INCR:
+      return V_02842C_STENCIL_ADD_CLAMP;
+   case PIPE_STENCIL_OP_DECR:
+      return V_02842C_STENCIL_SUB_CLAMP;
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      return V_02842C_STENCIL_ADD_WRAP;
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      return V_02842C_STENCIL_SUB_WRAP;
+   case PIPE_STENCIL_OP_INVERT:
+      return V_02842C_STENCIL_INVERT;
+   default:
+      PRINT_ERR("Unknown stencil op %d", s_op);
+      assert(0);
+      break;
+   }
+   return 0;
 }
 
 static bool si_dsa_writes_stencil(const struct pipe_stencil_state *s)
 {
-       return s->enabled && s->writemask &&
-              (s->fail_op  != PIPE_STENCIL_OP_KEEP ||
-               s->zfail_op != PIPE_STENCIL_OP_KEEP ||
-               s->zpass_op != PIPE_STENCIL_OP_KEEP);
+   return s->enabled && s->writemask &&
+          (s->fail_op != PIPE_STENCIL_OP_KEEP || s->zfail_op != PIPE_STENCIL_OP_KEEP ||
+           s->zpass_op != PIPE_STENCIL_OP_KEEP);
 }
 
 static bool si_order_invariant_stencil_op(enum pipe_stencil_op op)
 {
-       /* REPLACE is normally order invariant, except when the stencil
-        * reference value is written by the fragment shader. Tracking this
-        * interaction does not seem worth the effort, so be conservative. */
-       return op != PIPE_STENCIL_OP_INCR &&
-              op != PIPE_STENCIL_OP_DECR &&
-              op != PIPE_STENCIL_OP_REPLACE;
+   /* REPLACE is normally order invariant, except when the stencil
+    * reference value is written by the fragment shader. Tracking this
+    * interaction does not seem worth the effort, so be conservative. */
+   return op != PIPE_STENCIL_OP_INCR && op != PIPE_STENCIL_OP_DECR && op != PIPE_STENCIL_OP_REPLACE;
 }
 
 /* Compute whether, assuming Z writes are disabled, this stencil state is order
@@ -1192,325 +1112,304 @@ static bool si_order_invariant_stencil_op(enum pipe_stencil_op op)
  * final stencil buffer result does not depend on the order of fragments. */
 static bool si_order_invariant_stencil_state(const struct pipe_stencil_state *state)
 {
-       return !state->enabled || !state->writemask ||
-              /* The following assumes that Z writes are disabled. */
-              (state->func == PIPE_FUNC_ALWAYS &&
-               si_order_invariant_stencil_op(state->zpass_op) &&
-               si_order_invariant_stencil_op(state->zfail_op)) ||
-              (state->func == PIPE_FUNC_NEVER &&
-               si_order_invariant_stencil_op(state->fail_op));
+   return !state->enabled || !state->writemask ||
+          /* The following assumes that Z writes are disabled. */
+          (state->func == PIPE_FUNC_ALWAYS && si_order_invariant_stencil_op(state->zpass_op) &&
+           si_order_invariant_stencil_op(state->zfail_op)) ||
+          (state->func == PIPE_FUNC_NEVER && si_order_invariant_stencil_op(state->fail_op));
 }
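
A small standalone illustration of the predicate above, using a stripped-down stencil struct in place of pipe_stencil_state (the field names mirror the ones used here, but the struct and enums are only for the example):

#include <stdbool.h>
#include <stdio.h>

enum { OP_KEEP, OP_REPLACE, OP_INCR, OP_DECR, OP_INVERT };
enum { FUNC_NEVER, FUNC_ALWAYS };

struct stencil {
   bool enabled;
   unsigned writemask;
   unsigned func, fail_op, zfail_op, zpass_op;
};

static bool order_invariant_op(unsigned op)
{
   /* REPLACE excluded conservatively, as in the driver comment above. */
   return op != OP_INCR && op != OP_DECR && op != OP_REPLACE;
}

static bool order_invariant_state(const struct stencil *s)
{
   return !s->enabled || !s->writemask ||
          (s->func == FUNC_ALWAYS && order_invariant_op(s->zpass_op) &&
           order_invariant_op(s->zfail_op)) ||
          (s->func == FUNC_NEVER && order_invariant_op(s->fail_op));
}

int main(void)
{
   /* ALWAYS + INVERT on pass is order invariant; ALWAYS + INCR (saturating
    * add) is not, because the final value depends on fragment order. */
   struct stencil a = {true, 0xff, FUNC_ALWAYS, OP_KEEP, OP_KEEP, OP_INVERT};
   struct stencil b = {true, 0xff, FUNC_ALWAYS, OP_KEEP, OP_KEEP, OP_INCR};
   printf("a: %d, b: %d\n", order_invariant_state(&a), order_invariant_state(&b));
   return 0;
}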
 
 static void *si_create_dsa_state(struct pipe_context *ctx,
-                                const struct pipe_depth_stencil_alpha_state *state)
+                                 const struct pipe_depth_stencil_alpha_state *state)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_state_dsa *dsa = CALLOC_STRUCT(si_state_dsa);
-       struct si_pm4_state *pm4 = &dsa->pm4;
-       unsigned db_depth_control;
-       uint32_t db_stencil_control = 0;
-
-       if (!dsa) {
-               return NULL;
-       }
-
-       dsa->stencil_ref.valuemask[0] = state->stencil[0].valuemask;
-       dsa->stencil_ref.valuemask[1] = state->stencil[1].valuemask;
-       dsa->stencil_ref.writemask[0] = state->stencil[0].writemask;
-       dsa->stencil_ref.writemask[1] = state->stencil[1].writemask;
-
-       db_depth_control = S_028800_Z_ENABLE(state->depth.enabled) |
-               S_028800_Z_WRITE_ENABLE(state->depth.writemask) |
-               S_028800_ZFUNC(state->depth.func) |
-               S_028800_DEPTH_BOUNDS_ENABLE(state->depth.bounds_test);
-
-       /* stencil */
-       if (state->stencil[0].enabled) {
-               db_depth_control |= S_028800_STENCIL_ENABLE(1);
-               db_depth_control |= S_028800_STENCILFUNC(state->stencil[0].func);
-               db_stencil_control |= S_02842C_STENCILFAIL(si_translate_stencil_op(state->stencil[0].fail_op));
-               db_stencil_control |= S_02842C_STENCILZPASS(si_translate_stencil_op(state->stencil[0].zpass_op));
-               db_stencil_control |= S_02842C_STENCILZFAIL(si_translate_stencil_op(state->stencil[0].zfail_op));
-
-               if (state->stencil[1].enabled) {
-                       db_depth_control |= S_028800_BACKFACE_ENABLE(1);
-                       db_depth_control |= S_028800_STENCILFUNC_BF(state->stencil[1].func);
-                       db_stencil_control |= S_02842C_STENCILFAIL_BF(si_translate_stencil_op(state->stencil[1].fail_op));
-                       db_stencil_control |= S_02842C_STENCILZPASS_BF(si_translate_stencil_op(state->stencil[1].zpass_op));
-                       db_stencil_control |= S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(state->stencil[1].zfail_op));
-               }
-       }
-
-       /* alpha */
-       if (state->alpha.enabled) {
-               dsa->alpha_func = state->alpha.func;
-
-               si_pm4_set_reg(pm4, R_00B030_SPI_SHADER_USER_DATA_PS_0 +
-                              SI_SGPR_ALPHA_REF * 4, fui(state->alpha.ref_value));
-       } else {
-               dsa->alpha_func = PIPE_FUNC_ALWAYS;
-       }
-
-       si_pm4_set_reg(pm4, R_028800_DB_DEPTH_CONTROL, db_depth_control);
-       if (state->stencil[0].enabled)
-               si_pm4_set_reg(pm4, R_02842C_DB_STENCIL_CONTROL, db_stencil_control);
-       if (state->depth.bounds_test) {
-               si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, fui(state->depth.bounds_min));
-               si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, fui(state->depth.bounds_max));
-       }
-
-       dsa->depth_enabled = state->depth.enabled;
-       dsa->depth_write_enabled = state->depth.enabled &&
-                                  state->depth.writemask;
-       dsa->stencil_enabled = state->stencil[0].enabled;
-       dsa->stencil_write_enabled = state->stencil[0].enabled &&
-                                    (si_dsa_writes_stencil(&state->stencil[0]) ||
-                                     si_dsa_writes_stencil(&state->stencil[1]));
-       dsa->db_can_write = dsa->depth_write_enabled ||
-                           dsa->stencil_write_enabled;
-
-       bool zfunc_is_ordered =
-               state->depth.func == PIPE_FUNC_NEVER ||
-               state->depth.func == PIPE_FUNC_LESS ||
-               state->depth.func == PIPE_FUNC_LEQUAL ||
-               state->depth.func == PIPE_FUNC_GREATER ||
-               state->depth.func == PIPE_FUNC_GEQUAL;
-
-       bool nozwrite_and_order_invariant_stencil =
-               !dsa->db_can_write ||
-               (!dsa->depth_write_enabled &&
-                si_order_invariant_stencil_state(&state->stencil[0]) &&
-                si_order_invariant_stencil_state(&state->stencil[1]));
-
-       dsa->order_invariance[1].zs =
-               nozwrite_and_order_invariant_stencil ||
-               (!dsa->stencil_write_enabled && zfunc_is_ordered);
-       dsa->order_invariance[0].zs = !dsa->depth_write_enabled || zfunc_is_ordered;
-
-       dsa->order_invariance[1].pass_set =
-               nozwrite_and_order_invariant_stencil ||
-               (!dsa->stencil_write_enabled &&
-                (state->depth.func == PIPE_FUNC_ALWAYS ||
-                 state->depth.func == PIPE_FUNC_NEVER));
-       dsa->order_invariance[0].pass_set =
-               !dsa->depth_write_enabled ||
-               (state->depth.func == PIPE_FUNC_ALWAYS ||
-                state->depth.func == PIPE_FUNC_NEVER);
-
-       dsa->order_invariance[1].pass_last =
-               sctx->screen->assume_no_z_fights &&
-               !dsa->stencil_write_enabled &&
-               dsa->depth_write_enabled && zfunc_is_ordered;
-       dsa->order_invariance[0].pass_last =
-               sctx->screen->assume_no_z_fights &&
-               dsa->depth_write_enabled && zfunc_is_ordered;
-
-       return dsa;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_state_dsa *dsa = CALLOC_STRUCT(si_state_dsa);
+   struct si_pm4_state *pm4 = &dsa->pm4;
+   unsigned db_depth_control;
+   uint32_t db_stencil_control = 0;
+
+   if (!dsa) {
+      return NULL;
+   }
+
+   dsa->stencil_ref.valuemask[0] = state->stencil[0].valuemask;
+   dsa->stencil_ref.valuemask[1] = state->stencil[1].valuemask;
+   dsa->stencil_ref.writemask[0] = state->stencil[0].writemask;
+   dsa->stencil_ref.writemask[1] = state->stencil[1].writemask;
+
+   db_depth_control =
+      S_028800_Z_ENABLE(state->depth.enabled) | S_028800_Z_WRITE_ENABLE(state->depth.writemask) |
+      S_028800_ZFUNC(state->depth.func) | S_028800_DEPTH_BOUNDS_ENABLE(state->depth.bounds_test);
+
+   /* stencil */
+   if (state->stencil[0].enabled) {
+      db_depth_control |= S_028800_STENCIL_ENABLE(1);
+      db_depth_control |= S_028800_STENCILFUNC(state->stencil[0].func);
+      db_stencil_control |=
+         S_02842C_STENCILFAIL(si_translate_stencil_op(state->stencil[0].fail_op));
+      db_stencil_control |=
+         S_02842C_STENCILZPASS(si_translate_stencil_op(state->stencil[0].zpass_op));
+      db_stencil_control |=
+         S_02842C_STENCILZFAIL(si_translate_stencil_op(state->stencil[0].zfail_op));
+
+      if (state->stencil[1].enabled) {
+         db_depth_control |= S_028800_BACKFACE_ENABLE(1);
+         db_depth_control |= S_028800_STENCILFUNC_BF(state->stencil[1].func);
+         db_stencil_control |=
+            S_02842C_STENCILFAIL_BF(si_translate_stencil_op(state->stencil[1].fail_op));
+         db_stencil_control |=
+            S_02842C_STENCILZPASS_BF(si_translate_stencil_op(state->stencil[1].zpass_op));
+         db_stencil_control |=
+            S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(state->stencil[1].zfail_op));
+      }
+   }
+
+   /* alpha */
+   if (state->alpha.enabled) {
+      dsa->alpha_func = state->alpha.func;
+
+      si_pm4_set_reg(pm4, R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_ALPHA_REF * 4,
+                     fui(state->alpha.ref_value));
+   } else {
+      dsa->alpha_func = PIPE_FUNC_ALWAYS;
+   }
+
+   si_pm4_set_reg(pm4, R_028800_DB_DEPTH_CONTROL, db_depth_control);
+   if (state->stencil[0].enabled)
+      si_pm4_set_reg(pm4, R_02842C_DB_STENCIL_CONTROL, db_stencil_control);
+   if (state->depth.bounds_test) {
+      si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, fui(state->depth.bounds_min));
+      si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, fui(state->depth.bounds_max));
+   }
+
+   dsa->depth_enabled = state->depth.enabled;
+   dsa->depth_write_enabled = state->depth.enabled && state->depth.writemask;
+   dsa->stencil_enabled = state->stencil[0].enabled;
+   dsa->stencil_write_enabled =
+      state->stencil[0].enabled &&
+      (si_dsa_writes_stencil(&state->stencil[0]) || si_dsa_writes_stencil(&state->stencil[1]));
+   dsa->db_can_write = dsa->depth_write_enabled || dsa->stencil_write_enabled;
+
+   bool zfunc_is_ordered =
+      state->depth.func == PIPE_FUNC_NEVER || state->depth.func == PIPE_FUNC_LESS ||
+      state->depth.func == PIPE_FUNC_LEQUAL || state->depth.func == PIPE_FUNC_GREATER ||
+      state->depth.func == PIPE_FUNC_GEQUAL;
+
+   bool nozwrite_and_order_invariant_stencil =
+      !dsa->db_can_write ||
+      (!dsa->depth_write_enabled && si_order_invariant_stencil_state(&state->stencil[0]) &&
+       si_order_invariant_stencil_state(&state->stencil[1]));
+
+   dsa->order_invariance[1].zs =
+      nozwrite_and_order_invariant_stencil || (!dsa->stencil_write_enabled && zfunc_is_ordered);
+   dsa->order_invariance[0].zs = !dsa->depth_write_enabled || zfunc_is_ordered;
+
+   dsa->order_invariance[1].pass_set =
+      nozwrite_and_order_invariant_stencil ||
+      (!dsa->stencil_write_enabled &&
+       (state->depth.func == PIPE_FUNC_ALWAYS || state->depth.func == PIPE_FUNC_NEVER));
+   dsa->order_invariance[0].pass_set =
+      !dsa->depth_write_enabled ||
+      (state->depth.func == PIPE_FUNC_ALWAYS || state->depth.func == PIPE_FUNC_NEVER);
+
+   dsa->order_invariance[1].pass_last = sctx->screen->assume_no_z_fights &&
+                                        !dsa->stencil_write_enabled && dsa->depth_write_enabled &&
+                                        zfunc_is_ordered;
+   dsa->order_invariance[0].pass_last =
+      sctx->screen->assume_no_z_fights && dsa->depth_write_enabled && zfunc_is_ordered;
+
+   return dsa;
 }
 
 static void si_bind_dsa_state(struct pipe_context *ctx, void *state)
 {
-        struct si_context *sctx = (struct si_context *)ctx;
-       struct si_state_dsa *old_dsa = sctx->queued.named.dsa;
-        struct si_state_dsa *dsa = state;
-
-        if (!dsa)
-                dsa = (struct si_state_dsa *)sctx->noop_dsa;
-
-       si_pm4_bind_state(sctx, dsa, dsa);
-
-       if (memcmp(&dsa->stencil_ref, &sctx->stencil_ref.dsa_part,
-                  sizeof(struct si_dsa_stencil_ref_part)) != 0) {
-               sctx->stencil_ref.dsa_part = dsa->stencil_ref;
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref);
-       }
-
-       if (old_dsa->alpha_func != dsa->alpha_func)
-               sctx->do_update_shaders = true;
-
-       if (sctx->screen->dpbb_allowed &&
-           ((old_dsa->depth_enabled != dsa->depth_enabled ||
-             old_dsa->stencil_enabled != dsa->stencil_enabled ||
-             old_dsa->db_can_write != dsa->db_can_write)))
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
-
-       if (sctx->screen->has_out_of_order_rast &&
-           (memcmp(old_dsa->order_invariance, dsa->order_invariance,
-                   sizeof(old_dsa->order_invariance))))
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_state_dsa *old_dsa = sctx->queued.named.dsa;
+   struct si_state_dsa *dsa = state;
+
+   if (!dsa)
+      dsa = (struct si_state_dsa *)sctx->noop_dsa;
+
+   si_pm4_bind_state(sctx, dsa, dsa);
+
+   if (memcmp(&dsa->stencil_ref, &sctx->stencil_ref.dsa_part,
+              sizeof(struct si_dsa_stencil_ref_part)) != 0) {
+      sctx->stencil_ref.dsa_part = dsa->stencil_ref;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref);
+   }
+
+   if (old_dsa->alpha_func != dsa->alpha_func)
+      sctx->do_update_shaders = true;
+
+   if (sctx->screen->dpbb_allowed && ((old_dsa->depth_enabled != dsa->depth_enabled ||
+                                       old_dsa->stencil_enabled != dsa->stencil_enabled ||
+                                       old_dsa->db_can_write != dsa->db_can_write)))
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
+
+   if (sctx->screen->has_out_of_order_rast &&
+       (memcmp(old_dsa->order_invariance, dsa->order_invariance,
+               sizeof(old_dsa->order_invariance))))
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
 }
 
 static void si_delete_dsa_state(struct pipe_context *ctx, void *state)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
+   struct si_context *sctx = (struct si_context *)ctx;
 
-       if (sctx->queued.named.dsa == state)
-               si_bind_dsa_state(ctx, sctx->noop_dsa);
+   if (sctx->queued.named.dsa == state)
+      si_bind_dsa_state(ctx, sctx->noop_dsa);
 
-       si_pm4_delete_state(sctx, dsa, (struct si_state_dsa *)state);
+   si_pm4_delete_state(sctx, dsa, (struct si_state_dsa *)state);
 }
 
 static void *si_create_db_flush_dsa(struct si_context *sctx)
 {
-       struct pipe_depth_stencil_alpha_state dsa = {};
+   struct pipe_depth_stencil_alpha_state dsa = {};
 
-       return sctx->b.create_depth_stencil_alpha_state(&sctx->b, &dsa);
+   return sctx->b.create_depth_stencil_alpha_state(&sctx->b, &dsa);
 }
 
 /* DB RENDER STATE */
 
 static void si_set_active_query_state(struct pipe_context *ctx, bool enable)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
-
-       /* Pipeline stat & streamout queries. */
-       if (enable) {
-               sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
-               sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
-       } else {
-               sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
-               sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
-       }
-
-       /* Occlusion queries. */
-       if (sctx->occlusion_queries_disabled != !enable) {
-               sctx->occlusion_queries_disabled = !enable;
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
-       }
+   struct si_context *sctx = (struct si_context *)ctx;
+
+   /* Pipeline stat & streamout queries. */
+   if (enable) {
+      sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
+      sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
+   } else {
+      sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
+      sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
+   }
+
+   /* Occlusion queries. */
+   if (sctx->occlusion_queries_disabled != !enable) {
+      sctx->occlusion_queries_disabled = !enable;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+   }
 }
 
-void si_set_occlusion_query_state(struct si_context *sctx,
-                                 bool old_perfect_enable)
+void si_set_occlusion_query_state(struct si_context *sctx, bool old_perfect_enable)
 {
-       si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 
-       bool perfect_enable = sctx->num_perfect_occlusion_queries != 0;
+   bool perfect_enable = sctx->num_perfect_occlusion_queries != 0;
 
-       if (perfect_enable != old_perfect_enable)
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+   if (perfect_enable != old_perfect_enable)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
 }
 
 void si_save_qbo_state(struct si_context *sctx, struct si_qbo_state *st)
 {
-       st->saved_compute = sctx->cs_shader_state.program;
+   st->saved_compute = sctx->cs_shader_state.program;
 
-       si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
-       si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo);
+   si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
+   si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo);
 
-       st->saved_ssbo_writable_mask = 0;
+   st->saved_ssbo_writable_mask = 0;
 
-       for (unsigned i = 0; i < 3; i++) {
-               if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
-                   (1u << si_get_shaderbuf_slot(i)))
-                       st->saved_ssbo_writable_mask |= 1 << i;
-       }
+   for (unsigned i = 0; i < 3; i++) {
+      if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
+          (1u << si_get_shaderbuf_slot(i)))
+         st->saved_ssbo_writable_mask |= 1 << i;
+   }
 }
 
 void si_restore_qbo_state(struct si_context *sctx, struct si_qbo_state *st)
 {
-       sctx->b.bind_compute_state(&sctx->b, st->saved_compute);
+   sctx->b.bind_compute_state(&sctx->b, st->saved_compute);
 
-       sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
-       pipe_resource_reference(&st->saved_const0.buffer, NULL);
+   sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
+   pipe_resource_reference(&st->saved_const0.buffer, NULL);
 
-       sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo,
-                                  st->saved_ssbo_writable_mask);
-       for (unsigned i = 0; i < 3; ++i)
-               pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL);
+   sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo,
+                              st->saved_ssbo_writable_mask);
+   for (unsigned i = 0; i < 3; ++i)
+      pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL);
 }
 
 static void si_emit_db_render_state(struct si_context *sctx)
 {
-       struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
-       unsigned db_shader_control, db_render_control, db_count_control;
-       unsigned initial_cdw = sctx->gfx_cs->current.cdw;
-
-       /* DB_RENDER_CONTROL */
-       if (sctx->dbcb_depth_copy_enabled ||
-           sctx->dbcb_stencil_copy_enabled) {
-               db_render_control =
-                       S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) |
-                       S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) |
-                       S_028000_COPY_CENTROID(1) |
-                       S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample);
-       } else if (sctx->db_flush_depth_inplace || sctx->db_flush_stencil_inplace) {
-               db_render_control =
-                       S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) |
-                       S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace);
-       } else {
-               db_render_control =
-                       S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) |
-                       S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear);
-       }
-
-       /* DB_COUNT_CONTROL (occlusion queries) */
-       if (sctx->num_occlusion_queries > 0 &&
-           !sctx->occlusion_queries_disabled) {
-               bool perfect = sctx->num_perfect_occlusion_queries > 0;
-               bool gfx10_perfect = sctx->chip_class >= GFX10 && perfect;
-
-               if (sctx->chip_class >= GFX7) {
-                       unsigned log_sample_rate = sctx->framebuffer.log_samples;
-
-                       /* Stoney doesn't increment occlusion query counters
-                        * if the sample rate is 16x. Use 8x sample rate instead.
-                        */
-                       if (sctx->family == CHIP_STONEY)
-                               log_sample_rate = MIN2(log_sample_rate, 3);
-
-                       db_count_control =
-                               S_028004_PERFECT_ZPASS_COUNTS(perfect) |
-                               S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) |
-                               S_028004_SAMPLE_RATE(log_sample_rate) |
-                               S_028004_ZPASS_ENABLE(1) |
-                               S_028004_SLICE_EVEN_ENABLE(1) |
-                               S_028004_SLICE_ODD_ENABLE(1);
-               } else {
-                       db_count_control =
-                               S_028004_PERFECT_ZPASS_COUNTS(perfect) |
-                               S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples);
-               }
-       } else {
-               /* Disable occlusion queries. */
-               if (sctx->chip_class >= GFX7) {
-                       db_count_control = 0;
-               } else {
-                       db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
-               }
-       }
-
-       radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL,
-                                   SI_TRACKED_DB_RENDER_CONTROL, db_render_control,
-                                   db_count_control);
-
-       /* DB_RENDER_OVERRIDE2 */
-       radeon_opt_set_context_reg(sctx,  R_028010_DB_RENDER_OVERRIDE2,
-               SI_TRACKED_DB_RENDER_OVERRIDE2,
-               S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) |
-               S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) |
-               S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4));
-
-       db_shader_control = sctx->ps_db_shader_control;
-
-       /* Bug workaround for smoothing (overrasterization) on GFX6. */
-       if (sctx->chip_class == GFX6 && sctx->smoothing_enabled) {
-               db_shader_control &= C_02880C_Z_ORDER;
-               db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z);
-       }
-
-       /* Disable the gl_SampleMask fragment shader output if MSAA is disabled. */
-       if (!rs->multisample_enable)
-               db_shader_control &= C_02880C_MASK_EXPORT_ENABLE;
-
-       if (sctx->screen->info.has_rbplus &&
-           !sctx->screen->info.rbplus_allowed)
-               db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1);
-
-       radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL,
-                                  SI_TRACKED_DB_SHADER_CONTROL, db_shader_control);
-
-       if (initial_cdw != sctx->gfx_cs->current.cdw)
-               sctx->context_roll = true;
+   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+   unsigned db_shader_control, db_render_control, db_count_control;
+   unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+
+   /* DB_RENDER_CONTROL */
+   if (sctx->dbcb_depth_copy_enabled || sctx->dbcb_stencil_copy_enabled) {
+      db_render_control = S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) |
+                          S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) |
+                          S_028000_COPY_CENTROID(1) | S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample);
+   } else if (sctx->db_flush_depth_inplace || sctx->db_flush_stencil_inplace) {
+      db_render_control = S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) |
+                          S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace);
+   } else {
+      db_render_control = S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) |
+                          S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear);
+   }
+
+   /* DB_COUNT_CONTROL (occlusion queries) */
+   if (sctx->num_occlusion_queries > 0 && !sctx->occlusion_queries_disabled) {
+      bool perfect = sctx->num_perfect_occlusion_queries > 0;
+      bool gfx10_perfect = sctx->chip_class >= GFX10 && perfect;
+
+      if (sctx->chip_class >= GFX7) {
+         unsigned log_sample_rate = sctx->framebuffer.log_samples;
+
+         /* Stoney doesn't increment occlusion query counters
+          * if the sample rate is 16x. Use 8x sample rate instead.
+          */
+         if (sctx->family == CHIP_STONEY)
+            log_sample_rate = MIN2(log_sample_rate, 3);
+
+         db_count_control = S_028004_PERFECT_ZPASS_COUNTS(perfect) |
+                            S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) |
+                            S_028004_SAMPLE_RATE(log_sample_rate) | S_028004_ZPASS_ENABLE(1) |
+                            S_028004_SLICE_EVEN_ENABLE(1) | S_028004_SLICE_ODD_ENABLE(1);
+      } else {
+         db_count_control = S_028004_PERFECT_ZPASS_COUNTS(perfect) |
+                            S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples);
+      }
+   } else {
+      /* Disable occlusion queries. */
+      if (sctx->chip_class >= GFX7) {
+         db_count_control = 0;
+      } else {
+         db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
+      }
+   }
+
+   radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL, SI_TRACKED_DB_RENDER_CONTROL,
+                               db_render_control, db_count_control);
+
+   /* DB_RENDER_OVERRIDE2 */
+   radeon_opt_set_context_reg(
+      sctx, R_028010_DB_RENDER_OVERRIDE2, SI_TRACKED_DB_RENDER_OVERRIDE2,
+      S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) |
+         S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) |
+         S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4));
+
+   db_shader_control = sctx->ps_db_shader_control;
+
+   /* Bug workaround for smoothing (overrasterization) on GFX6. */
+   if (sctx->chip_class == GFX6 && sctx->smoothing_enabled) {
+      db_shader_control &= C_02880C_Z_ORDER;
+      db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z);
+   }
+
+   /* Disable the gl_SampleMask fragment shader output if MSAA is disabled. */
+   if (!rs->multisample_enable)
+      db_shader_control &= C_02880C_MASK_EXPORT_ENABLE;
+
+   if (sctx->screen->info.has_rbplus && !sctx->screen->info.rbplus_allowed)
+      db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1);
+
+   radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL, SI_TRACKED_DB_SHADER_CONTROL,
+                              db_shader_control);
+
+   if (initial_cdw != sctx->gfx_cs->current.cdw)
+      sctx->context_roll = true;
 }
 
 /*
@@ -1518,514 +1417,500 @@ static void si_emit_db_render_state(struct si_context *sctx)
  */
 static uint32_t si_translate_colorformat(enum pipe_format format)
 {
-       const struct util_format_description *desc = util_format_description(format);
-       if (!desc)
-               return V_028C70_COLOR_INVALID;
-
-#define HAS_SIZE(x,y,z,w) \
-       (desc->channel[0].size == (x) && desc->channel[1].size == (y) && \
-         desc->channel[2].size == (z) && desc->channel[3].size == (w))
-
-       if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */
-               return V_028C70_COLOR_10_11_11;
-
-       if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
-               return V_028C70_COLOR_INVALID;
-
-       /* hw cannot support mixed formats (except depth/stencil, since
-        * stencil is not written to). */
-       if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
-               return V_028C70_COLOR_INVALID;
-
-       switch (desc->nr_channels) {
-       case 1:
-               switch (desc->channel[0].size) {
-               case 8:
-                       return V_028C70_COLOR_8;
-               case 16:
-                       return V_028C70_COLOR_16;
-               case 32:
-                       return V_028C70_COLOR_32;
-               }
-               break;
-       case 2:
-               if (desc->channel[0].size == desc->channel[1].size) {
-                       switch (desc->channel[0].size) {
-                       case 8:
-                               return V_028C70_COLOR_8_8;
-                       case 16:
-                               return V_028C70_COLOR_16_16;
-                       case 32:
-                               return V_028C70_COLOR_32_32;
-                       }
-               } else if (HAS_SIZE(8,24,0,0)) {
-                       return V_028C70_COLOR_24_8;
-               } else if (HAS_SIZE(24,8,0,0)) {
-                       return V_028C70_COLOR_8_24;
-               }
-               break;
-       case 3:
-               if (HAS_SIZE(5,6,5,0)) {
-                       return V_028C70_COLOR_5_6_5;
-               } else if (HAS_SIZE(32,8,24,0)) {
-                       return V_028C70_COLOR_X24_8_32_FLOAT;
-               }
-               break;
-       case 4:
-               if (desc->channel[0].size == desc->channel[1].size &&
-                   desc->channel[0].size == desc->channel[2].size &&
-                   desc->channel[0].size == desc->channel[3].size) {
-                       switch (desc->channel[0].size) {
-                       case 4:
-                               return V_028C70_COLOR_4_4_4_4;
-                       case 8:
-                               return V_028C70_COLOR_8_8_8_8;
-                       case 16:
-                               return V_028C70_COLOR_16_16_16_16;
-                       case 32:
-                               return V_028C70_COLOR_32_32_32_32;
-                       }
-               } else if (HAS_SIZE(5,5,5,1)) {
-                       return V_028C70_COLOR_1_5_5_5;
-               } else if (HAS_SIZE(1,5,5,5)) {
-                       return V_028C70_COLOR_5_5_5_1;
-               } else if (HAS_SIZE(10,10,10,2)) {
-                       return V_028C70_COLOR_2_10_10_10;
-               }
-               break;
-       }
-       return V_028C70_COLOR_INVALID;
+   const struct util_format_description *desc = util_format_description(format);
+   if (!desc)
+      return V_028C70_COLOR_INVALID;
+
+#define HAS_SIZE(x, y, z, w)                                                                       \
+   (desc->channel[0].size == (x) && desc->channel[1].size == (y) &&                                \
+    desc->channel[2].size == (z) && desc->channel[3].size == (w))
+
+   if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */
+      return V_028C70_COLOR_10_11_11;
+
+   if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
+      return V_028C70_COLOR_INVALID;
+
+   /* hw cannot support mixed formats (except depth/stencil, since
+    * stencil is not written to). */
+   if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
+      return V_028C70_COLOR_INVALID;
+
+   switch (desc->nr_channels) {
+   case 1:
+      switch (desc->channel[0].size) {
+      case 8:
+         return V_028C70_COLOR_8;
+      case 16:
+         return V_028C70_COLOR_16;
+      case 32:
+         return V_028C70_COLOR_32;
+      }
+      break;
+   case 2:
+      if (desc->channel[0].size == desc->channel[1].size) {
+         switch (desc->channel[0].size) {
+         case 8:
+            return V_028C70_COLOR_8_8;
+         case 16:
+            return V_028C70_COLOR_16_16;
+         case 32:
+            return V_028C70_COLOR_32_32;
+         }
+      } else if (HAS_SIZE(8, 24, 0, 0)) {
+         return V_028C70_COLOR_24_8;
+      } else if (HAS_SIZE(24, 8, 0, 0)) {
+         return V_028C70_COLOR_8_24;
+      }
+      break;
+   case 3:
+      if (HAS_SIZE(5, 6, 5, 0)) {
+         return V_028C70_COLOR_5_6_5;
+      } else if (HAS_SIZE(32, 8, 24, 0)) {
+         return V_028C70_COLOR_X24_8_32_FLOAT;
+      }
+      break;
+   case 4:
+      if (desc->channel[0].size == desc->channel[1].size &&
+          desc->channel[0].size == desc->channel[2].size &&
+          desc->channel[0].size == desc->channel[3].size) {
+         switch (desc->channel[0].size) {
+         case 4:
+            return V_028C70_COLOR_4_4_4_4;
+         case 8:
+            return V_028C70_COLOR_8_8_8_8;
+         case 16:
+            return V_028C70_COLOR_16_16_16_16;
+         case 32:
+            return V_028C70_COLOR_32_32_32_32;
+         }
+      } else if (HAS_SIZE(5, 5, 5, 1)) {
+         return V_028C70_COLOR_1_5_5_5;
+      } else if (HAS_SIZE(1, 5, 5, 5)) {
+         return V_028C70_COLOR_5_5_5_1;
+      } else if (HAS_SIZE(10, 10, 10, 2)) {
+         return V_028C70_COLOR_2_10_10_10;
+      }
+      break;
+   }
+   return V_028C70_COLOR_INVALID;
 }
 
 static uint32_t si_colorformat_endian_swap(uint32_t colorformat)
 {
-       if (SI_BIG_ENDIAN) {
-               switch(colorformat) {
-               /* 8-bit buffers. */
-               case V_028C70_COLOR_8:
-                       return V_028C70_ENDIAN_NONE;
-
-               /* 16-bit buffers. */
-               case V_028C70_COLOR_5_6_5:
-               case V_028C70_COLOR_1_5_5_5:
-               case V_028C70_COLOR_4_4_4_4:
-               case V_028C70_COLOR_16:
-               case V_028C70_COLOR_8_8:
-                       return V_028C70_ENDIAN_8IN16;
-
-               /* 32-bit buffers. */
-               case V_028C70_COLOR_8_8_8_8:
-               case V_028C70_COLOR_2_10_10_10:
-               case V_028C70_COLOR_8_24:
-               case V_028C70_COLOR_24_8:
-               case V_028C70_COLOR_16_16:
-                       return V_028C70_ENDIAN_8IN32;
-
-               /* 64-bit buffers. */
-               case V_028C70_COLOR_16_16_16_16:
-                       return V_028C70_ENDIAN_8IN16;
-
-               case V_028C70_COLOR_32_32:
-                       return V_028C70_ENDIAN_8IN32;
-
-               /* 128-bit buffers. */
-               case V_028C70_COLOR_32_32_32_32:
-                       return V_028C70_ENDIAN_8IN32;
-               default:
-                       return V_028C70_ENDIAN_NONE; /* Unsupported. */
-               }
-       } else {
-               return V_028C70_ENDIAN_NONE;
-       }
+   if (SI_BIG_ENDIAN) {
+      switch (colorformat) {
+      /* 8-bit buffers. */
+      case V_028C70_COLOR_8:
+         return V_028C70_ENDIAN_NONE;
+
+      /* 16-bit buffers. */
+      case V_028C70_COLOR_5_6_5:
+      case V_028C70_COLOR_1_5_5_5:
+      case V_028C70_COLOR_4_4_4_4:
+      case V_028C70_COLOR_16:
+      case V_028C70_COLOR_8_8:
+         return V_028C70_ENDIAN_8IN16;
+
+      /* 32-bit buffers. */
+      case V_028C70_COLOR_8_8_8_8:
+      case V_028C70_COLOR_2_10_10_10:
+      case V_028C70_COLOR_8_24:
+      case V_028C70_COLOR_24_8:
+      case V_028C70_COLOR_16_16:
+         return V_028C70_ENDIAN_8IN32;
+
+      /* 64-bit buffers. */
+      case V_028C70_COLOR_16_16_16_16:
+         return V_028C70_ENDIAN_8IN16;
+
+      case V_028C70_COLOR_32_32:
+         return V_028C70_ENDIAN_8IN32;
+
+      /* 128-bit buffers. */
+      case V_028C70_COLOR_32_32_32_32:
+         return V_028C70_ENDIAN_8IN32;
+      default:
+         return V_028C70_ENDIAN_NONE; /* Unsupported. */
+      }
+   } else {
+      return V_028C70_ENDIAN_NONE;
+   }
 }
 
 static uint32_t si_translate_dbformat(enum pipe_format format)
 {
-       switch (format) {
-       case PIPE_FORMAT_Z16_UNORM:
-               return V_028040_Z_16;
-       case PIPE_FORMAT_S8_UINT_Z24_UNORM:
-       case PIPE_FORMAT_X8Z24_UNORM:
-       case PIPE_FORMAT_Z24X8_UNORM:
-       case PIPE_FORMAT_Z24_UNORM_S8_UINT:
-               return V_028040_Z_24; /* deprecated on AMD GCN */
-       case PIPE_FORMAT_Z32_FLOAT:
-       case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
-               return V_028040_Z_32_FLOAT;
-       default:
-               return V_028040_Z_INVALID;
-       }
+   switch (format) {
+   case PIPE_FORMAT_Z16_UNORM:
+      return V_028040_Z_16;
+   case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+      return V_028040_Z_24; /* deprecated on AMD GCN */
+   case PIPE_FORMAT_Z32_FLOAT:
+   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+      return V_028040_Z_32_FLOAT;
+   default:
+      return V_028040_Z_INVALID;
+   }
 }
 
 /*
  * Texture translation
  */
 
-static uint32_t si_translate_texformat(struct pipe_screen *screen,
-                                      enum pipe_format format,
-                                      const struct util_format_description *desc,
-                                      int first_non_void)
+static uint32_t si_translate_texformat(struct pipe_screen *screen, enum pipe_format format,
+                                       const struct util_format_description *desc,
+                                       int first_non_void)
 {
-       struct si_screen *sscreen = (struct si_screen*)screen;
-       bool uniform = true;
-       int i;
-
-       assert(sscreen->info.chip_class <= GFX9);
-
-       /* Colorspace (return non-RGB formats directly). */
-       switch (desc->colorspace) {
-       /* Depth stencil formats */
-       case UTIL_FORMAT_COLORSPACE_ZS:
-               switch (format) {
-               case PIPE_FORMAT_Z16_UNORM:
-                       return V_008F14_IMG_DATA_FORMAT_16;
-               case PIPE_FORMAT_X24S8_UINT:
-               case PIPE_FORMAT_S8X24_UINT:
-                       /*
-                        * Implemented as an 8_8_8_8 data format to fix texture
-                        * gathers in stencil sampling. This affects at least
-                        * GL45-CTS.texture_cube_map_array.sampling on GFX8.
-                        */
-                       if (sscreen->info.chip_class <= GFX8)
-                               return V_008F14_IMG_DATA_FORMAT_8_8_8_8;
-
-                       if (format == PIPE_FORMAT_X24S8_UINT)
-                               return V_008F14_IMG_DATA_FORMAT_8_24;
-                       else
-                               return V_008F14_IMG_DATA_FORMAT_24_8;
-               case PIPE_FORMAT_Z24X8_UNORM:
-               case PIPE_FORMAT_Z24_UNORM_S8_UINT:
-                       return V_008F14_IMG_DATA_FORMAT_8_24;
-               case PIPE_FORMAT_X8Z24_UNORM:
-               case PIPE_FORMAT_S8_UINT_Z24_UNORM:
-                       return V_008F14_IMG_DATA_FORMAT_24_8;
-               case PIPE_FORMAT_S8_UINT:
-                       return V_008F14_IMG_DATA_FORMAT_8;
-               case PIPE_FORMAT_Z32_FLOAT:
-                       return V_008F14_IMG_DATA_FORMAT_32;
-               case PIPE_FORMAT_X32_S8X24_UINT:
-               case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
-                       return V_008F14_IMG_DATA_FORMAT_X24_8_32;
-               default:
-                       goto out_unknown;
-               }
-
-       case UTIL_FORMAT_COLORSPACE_YUV:
-               goto out_unknown; /* TODO */
-
-       case UTIL_FORMAT_COLORSPACE_SRGB:
-               if (desc->nr_channels != 4 && desc->nr_channels != 1)
-                       goto out_unknown;
-               break;
-
-       default:
-               break;
-       }
-
-       if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC) {
-               if (!sscreen->info.has_format_bc1_through_bc7)
-                       goto out_unknown;
-
-               switch (format) {
-               case PIPE_FORMAT_RGTC1_SNORM:
-               case PIPE_FORMAT_LATC1_SNORM:
-               case PIPE_FORMAT_RGTC1_UNORM:
-               case PIPE_FORMAT_LATC1_UNORM:
-                       return V_008F14_IMG_DATA_FORMAT_BC4;
-               case PIPE_FORMAT_RGTC2_SNORM:
-               case PIPE_FORMAT_LATC2_SNORM:
-               case PIPE_FORMAT_RGTC2_UNORM:
-               case PIPE_FORMAT_LATC2_UNORM:
-                       return V_008F14_IMG_DATA_FORMAT_BC5;
-               default:
-                       goto out_unknown;
-               }
-       }
-
-       if (desc->layout == UTIL_FORMAT_LAYOUT_ETC &&
-           (sscreen->info.family == CHIP_STONEY ||
-            sscreen->info.family == CHIP_VEGA10 ||
-            sscreen->info.family == CHIP_RAVEN)) {
-               switch (format) {
-               case PIPE_FORMAT_ETC1_RGB8:
-               case PIPE_FORMAT_ETC2_RGB8:
-               case PIPE_FORMAT_ETC2_SRGB8:
-                       return V_008F14_IMG_DATA_FORMAT_ETC2_RGB;
-               case PIPE_FORMAT_ETC2_RGB8A1:
-               case PIPE_FORMAT_ETC2_SRGB8A1:
-                       return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA1;
-               case PIPE_FORMAT_ETC2_RGBA8:
-               case PIPE_FORMAT_ETC2_SRGBA8:
-                       return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA;
-               case PIPE_FORMAT_ETC2_R11_UNORM:
-               case PIPE_FORMAT_ETC2_R11_SNORM:
-                       return V_008F14_IMG_DATA_FORMAT_ETC2_R;
-               case PIPE_FORMAT_ETC2_RG11_UNORM:
-               case PIPE_FORMAT_ETC2_RG11_SNORM:
-                       return V_008F14_IMG_DATA_FORMAT_ETC2_RG;
-               default:
-                       goto out_unknown;
-               }
-       }
-
-       if (desc->layout == UTIL_FORMAT_LAYOUT_BPTC) {
-               if (!sscreen->info.has_format_bc1_through_bc7)
-                       goto out_unknown;
-
-               switch (format) {
-               case PIPE_FORMAT_BPTC_RGBA_UNORM:
-               case PIPE_FORMAT_BPTC_SRGBA:
-                       return V_008F14_IMG_DATA_FORMAT_BC7;
-               case PIPE_FORMAT_BPTC_RGB_FLOAT:
-               case PIPE_FORMAT_BPTC_RGB_UFLOAT:
-                       return V_008F14_IMG_DATA_FORMAT_BC6;
-               default:
-                       goto out_unknown;
-               }
-       }
-
-       if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
-               switch (format) {
-               case PIPE_FORMAT_R8G8_B8G8_UNORM:
-               case PIPE_FORMAT_G8R8_B8R8_UNORM:
-                       return V_008F14_IMG_DATA_FORMAT_GB_GR;
-               case PIPE_FORMAT_G8R8_G8B8_UNORM:
-               case PIPE_FORMAT_R8G8_R8B8_UNORM:
-                       return V_008F14_IMG_DATA_FORMAT_BG_RG;
-               default:
-                       goto out_unknown;
-               }
-       }
-
-       if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
-               if (!sscreen->info.has_format_bc1_through_bc7)
-                       goto out_unknown;
-
-               switch (format) {
-               case PIPE_FORMAT_DXT1_RGB:
-               case PIPE_FORMAT_DXT1_RGBA:
-               case PIPE_FORMAT_DXT1_SRGB:
-               case PIPE_FORMAT_DXT1_SRGBA:
-                       return V_008F14_IMG_DATA_FORMAT_BC1;
-               case PIPE_FORMAT_DXT3_RGBA:
-               case PIPE_FORMAT_DXT3_SRGBA:
-                       return V_008F14_IMG_DATA_FORMAT_BC2;
-               case PIPE_FORMAT_DXT5_RGBA:
-               case PIPE_FORMAT_DXT5_SRGBA:
-                       return V_008F14_IMG_DATA_FORMAT_BC3;
-               default:
-                       goto out_unknown;
-               }
-       }
-
-       if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
-               return V_008F14_IMG_DATA_FORMAT_5_9_9_9;
-       } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
-               return V_008F14_IMG_DATA_FORMAT_10_11_11;
-       }
-
-       /* R8G8Bx_SNORM - TODO CxV8U8 */
-
-       /* hw cannot support mixed formats (except depth/stencil, since only
-        * depth is read).*/
-       if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
-               goto out_unknown;
-
-       /* See whether the components are of the same size. */
-       for (i = 1; i < desc->nr_channels; i++) {
-               uniform = uniform && desc->channel[0].size == desc->channel[i].size;
-       }
-
-       /* Non-uniform formats. */
-       if (!uniform) {
-               switch(desc->nr_channels) {
-               case 3:
-                       if (desc->channel[0].size == 5 &&
-                           desc->channel[1].size == 6 &&
-                           desc->channel[2].size == 5) {
-                               return V_008F14_IMG_DATA_FORMAT_5_6_5;
-                       }
-                       goto out_unknown;
-               case 4:
-                       if (desc->channel[0].size == 5 &&
-                           desc->channel[1].size == 5 &&
-                           desc->channel[2].size == 5 &&
-                           desc->channel[3].size == 1) {
-                               return V_008F14_IMG_DATA_FORMAT_1_5_5_5;
-                       }
-                       if (desc->channel[0].size == 1 &&
-                           desc->channel[1].size == 5 &&
-                           desc->channel[2].size == 5 &&
-                           desc->channel[3].size == 5) {
-                               return V_008F14_IMG_DATA_FORMAT_5_5_5_1;
-                       }
-                       if (desc->channel[0].size == 10 &&
-                           desc->channel[1].size == 10 &&
-                           desc->channel[2].size == 10 &&
-                           desc->channel[3].size == 2) {
-                               return V_008F14_IMG_DATA_FORMAT_2_10_10_10;
-                       }
-                       goto out_unknown;
-               }
-               goto out_unknown;
-       }
-
-       if (first_non_void < 0 || first_non_void > 3)
-               goto out_unknown;
-
-       /* uniform formats */
-       switch (desc->channel[first_non_void].size) {
-       case 4:
-               switch (desc->nr_channels) {
+   struct si_screen *sscreen = (struct si_screen *)screen;
+   bool uniform = true;
+   int i;
+
+   assert(sscreen->info.chip_class <= GFX9);
+
+   /* Colorspace (return non-RGB formats directly). */
+   switch (desc->colorspace) {
+   /* Depth stencil formats */
+   case UTIL_FORMAT_COLORSPACE_ZS:
+      switch (format) {
+      case PIPE_FORMAT_Z16_UNORM:
+         return V_008F14_IMG_DATA_FORMAT_16;
+      case PIPE_FORMAT_X24S8_UINT:
+      case PIPE_FORMAT_S8X24_UINT:
+         /*
+          * Implemented as an 8_8_8_8 data format to fix texture
+          * gathers in stencil sampling. This affects at least
+          * GL45-CTS.texture_cube_map_array.sampling on GFX8.
+          */
+         if (sscreen->info.chip_class <= GFX8)
+            return V_008F14_IMG_DATA_FORMAT_8_8_8_8;
+
+         if (format == PIPE_FORMAT_X24S8_UINT)
+            return V_008F14_IMG_DATA_FORMAT_8_24;
+         else
+            return V_008F14_IMG_DATA_FORMAT_24_8;
+      case PIPE_FORMAT_Z24X8_UNORM:
+      case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+         return V_008F14_IMG_DATA_FORMAT_8_24;
+      case PIPE_FORMAT_X8Z24_UNORM:
+      case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+         return V_008F14_IMG_DATA_FORMAT_24_8;
+      case PIPE_FORMAT_S8_UINT:
+         return V_008F14_IMG_DATA_FORMAT_8;
+      case PIPE_FORMAT_Z32_FLOAT:
+         return V_008F14_IMG_DATA_FORMAT_32;
+      case PIPE_FORMAT_X32_S8X24_UINT:
+      case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+         return V_008F14_IMG_DATA_FORMAT_X24_8_32;
+      default:
+         goto out_unknown;
+      }
+
+   case UTIL_FORMAT_COLORSPACE_YUV:
+      goto out_unknown; /* TODO */
+
+   case UTIL_FORMAT_COLORSPACE_SRGB:
+      if (desc->nr_channels != 4 && desc->nr_channels != 1)
+         goto out_unknown;
+      break;
+
+   default:
+      break;
+   }
+
+   if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC) {
+      if (!sscreen->info.has_format_bc1_through_bc7)
+         goto out_unknown;
+
+      switch (format) {
+      case PIPE_FORMAT_RGTC1_SNORM:
+      case PIPE_FORMAT_LATC1_SNORM:
+      case PIPE_FORMAT_RGTC1_UNORM:
+      case PIPE_FORMAT_LATC1_UNORM:
+         return V_008F14_IMG_DATA_FORMAT_BC4;
+      case PIPE_FORMAT_RGTC2_SNORM:
+      case PIPE_FORMAT_LATC2_SNORM:
+      case PIPE_FORMAT_RGTC2_UNORM:
+      case PIPE_FORMAT_LATC2_UNORM:
+         return V_008F14_IMG_DATA_FORMAT_BC5;
+      default:
+         goto out_unknown;
+      }
+   }
+
+   if (desc->layout == UTIL_FORMAT_LAYOUT_ETC &&
+       (sscreen->info.family == CHIP_STONEY || sscreen->info.family == CHIP_VEGA10 ||
+        sscreen->info.family == CHIP_RAVEN)) {
+      switch (format) {
+      case PIPE_FORMAT_ETC1_RGB8:
+      case PIPE_FORMAT_ETC2_RGB8:
+      case PIPE_FORMAT_ETC2_SRGB8:
+         return V_008F14_IMG_DATA_FORMAT_ETC2_RGB;
+      case PIPE_FORMAT_ETC2_RGB8A1:
+      case PIPE_FORMAT_ETC2_SRGB8A1:
+         return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA1;
+      case PIPE_FORMAT_ETC2_RGBA8:
+      case PIPE_FORMAT_ETC2_SRGBA8:
+         return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA;
+      case PIPE_FORMAT_ETC2_R11_UNORM:
+      case PIPE_FORMAT_ETC2_R11_SNORM:
+         return V_008F14_IMG_DATA_FORMAT_ETC2_R;
+      case PIPE_FORMAT_ETC2_RG11_UNORM:
+      case PIPE_FORMAT_ETC2_RG11_SNORM:
+         return V_008F14_IMG_DATA_FORMAT_ETC2_RG;
+      default:
+         goto out_unknown;
+      }
+   }
+
+   if (desc->layout == UTIL_FORMAT_LAYOUT_BPTC) {
+      if (!sscreen->info.has_format_bc1_through_bc7)
+         goto out_unknown;
+
+      switch (format) {
+      case PIPE_FORMAT_BPTC_RGBA_UNORM:
+      case PIPE_FORMAT_BPTC_SRGBA:
+         return V_008F14_IMG_DATA_FORMAT_BC7;
+      case PIPE_FORMAT_BPTC_RGB_FLOAT:
+      case PIPE_FORMAT_BPTC_RGB_UFLOAT:
+         return V_008F14_IMG_DATA_FORMAT_BC6;
+      default:
+         goto out_unknown;
+      }
+   }
+
+   if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
+      switch (format) {
+      case PIPE_FORMAT_R8G8_B8G8_UNORM:
+      case PIPE_FORMAT_G8R8_B8R8_UNORM:
+         return V_008F14_IMG_DATA_FORMAT_GB_GR;
+      case PIPE_FORMAT_G8R8_G8B8_UNORM:
+      case PIPE_FORMAT_R8G8_R8B8_UNORM:
+         return V_008F14_IMG_DATA_FORMAT_BG_RG;
+      default:
+         goto out_unknown;
+      }
+   }
+
+   if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+      if (!sscreen->info.has_format_bc1_through_bc7)
+         goto out_unknown;
+
+      switch (format) {
+      case PIPE_FORMAT_DXT1_RGB:
+      case PIPE_FORMAT_DXT1_RGBA:
+      case PIPE_FORMAT_DXT1_SRGB:
+      case PIPE_FORMAT_DXT1_SRGBA:
+         return V_008F14_IMG_DATA_FORMAT_BC1;
+      case PIPE_FORMAT_DXT3_RGBA:
+      case PIPE_FORMAT_DXT3_SRGBA:
+         return V_008F14_IMG_DATA_FORMAT_BC2;
+      case PIPE_FORMAT_DXT5_RGBA:
+      case PIPE_FORMAT_DXT5_SRGBA:
+         return V_008F14_IMG_DATA_FORMAT_BC3;
+      default:
+         goto out_unknown;
+      }
+   }
+
+   if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
+      return V_008F14_IMG_DATA_FORMAT_5_9_9_9;
+   } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
+      return V_008F14_IMG_DATA_FORMAT_10_11_11;
+   }
+
+   /* R8G8Bx_SNORM - TODO CxV8U8 */
+
+   /* hw cannot support mixed formats (except depth/stencil, since only
+    * depth is read).*/
+   if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
+      goto out_unknown;
+
+   /* See whether the components are of the same size. */
+   for (i = 1; i < desc->nr_channels; i++) {
+      uniform = uniform && desc->channel[0].size == desc->channel[i].size;
+   }
+
+   /* Non-uniform formats. */
+   if (!uniform) {
+      switch (desc->nr_channels) {
+      case 3:
+         if (desc->channel[0].size == 5 && desc->channel[1].size == 6 &&
+             desc->channel[2].size == 5) {
+            return V_008F14_IMG_DATA_FORMAT_5_6_5;
+         }
+         goto out_unknown;
+      case 4:
+         if (desc->channel[0].size == 5 && desc->channel[1].size == 5 &&
+             desc->channel[2].size == 5 && desc->channel[3].size == 1) {
+            return V_008F14_IMG_DATA_FORMAT_1_5_5_5;
+         }
+         if (desc->channel[0].size == 1 && desc->channel[1].size == 5 &&
+             desc->channel[2].size == 5 && desc->channel[3].size == 5) {
+            return V_008F14_IMG_DATA_FORMAT_5_5_5_1;
+         }
+         if (desc->channel[0].size == 10 && desc->channel[1].size == 10 &&
+             desc->channel[2].size == 10 && desc->channel[3].size == 2) {
+            return V_008F14_IMG_DATA_FORMAT_2_10_10_10;
+         }
+         goto out_unknown;
+      }
+      goto out_unknown;
+   }
+
+   if (first_non_void < 0 || first_non_void > 3)
+      goto out_unknown;
+
+   /* uniform formats */
+   switch (desc->channel[first_non_void].size) {
+   case 4:
+      switch (desc->nr_channels) {
 #if 0 /* Not supported for render targets */
                case 2:
                        return V_008F14_IMG_DATA_FORMAT_4_4;
 #endif
-               case 4:
-                       return V_008F14_IMG_DATA_FORMAT_4_4_4_4;
-               }
-               break;
-       case 8:
-               switch (desc->nr_channels) {
-               case 1:
-                       return V_008F14_IMG_DATA_FORMAT_8;
-               case 2:
-                       return V_008F14_IMG_DATA_FORMAT_8_8;
-               case 4:
-                       return V_008F14_IMG_DATA_FORMAT_8_8_8_8;
-               }
-               break;
-       case 16:
-               switch (desc->nr_channels) {
-               case 1:
-                       return V_008F14_IMG_DATA_FORMAT_16;
-               case 2:
-                       return V_008F14_IMG_DATA_FORMAT_16_16;
-               case 4:
-                       return V_008F14_IMG_DATA_FORMAT_16_16_16_16;
-               }
-               break;
-       case 32:
-               switch (desc->nr_channels) {
-               case 1:
-                       return V_008F14_IMG_DATA_FORMAT_32;
-               case 2:
-                       return V_008F14_IMG_DATA_FORMAT_32_32;
+      case 4:
+         return V_008F14_IMG_DATA_FORMAT_4_4_4_4;
+      }
+      break;
+   case 8:
+      switch (desc->nr_channels) {
+      case 1:
+         return V_008F14_IMG_DATA_FORMAT_8;
+      case 2:
+         return V_008F14_IMG_DATA_FORMAT_8_8;
+      case 4:
+         return V_008F14_IMG_DATA_FORMAT_8_8_8_8;
+      }
+      break;
+   case 16:
+      switch (desc->nr_channels) {
+      case 1:
+         return V_008F14_IMG_DATA_FORMAT_16;
+      case 2:
+         return V_008F14_IMG_DATA_FORMAT_16_16;
+      case 4:
+         return V_008F14_IMG_DATA_FORMAT_16_16_16_16;
+      }
+      break;
+   case 32:
+      switch (desc->nr_channels) {
+      case 1:
+         return V_008F14_IMG_DATA_FORMAT_32;
+      case 2:
+         return V_008F14_IMG_DATA_FORMAT_32_32;
 #if 0 /* Not supported for render targets */
                case 3:
                        return V_008F14_IMG_DATA_FORMAT_32_32_32;
 #endif
-               case 4:
-                       return V_008F14_IMG_DATA_FORMAT_32_32_32_32;
-               }
-       }
+      case 4:
+         return V_008F14_IMG_DATA_FORMAT_32_32_32_32;
+      }
+   }
 
 out_unknown:
-       return ~0;
+   return ~0;
 }
 
 static unsigned si_tex_wrap(unsigned wrap)
 {
-       switch (wrap) {
-       default:
-       case PIPE_TEX_WRAP_REPEAT:
-               return V_008F30_SQ_TEX_WRAP;
-       case PIPE_TEX_WRAP_CLAMP:
-               return V_008F30_SQ_TEX_CLAMP_HALF_BORDER;
-       case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-               return V_008F30_SQ_TEX_CLAMP_LAST_TEXEL;
-       case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-               return V_008F30_SQ_TEX_CLAMP_BORDER;
-       case PIPE_TEX_WRAP_MIRROR_REPEAT:
-               return V_008F30_SQ_TEX_MIRROR;
-       case PIPE_TEX_WRAP_MIRROR_CLAMP:
-               return V_008F30_SQ_TEX_MIRROR_ONCE_HALF_BORDER;
-       case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
-               return V_008F30_SQ_TEX_MIRROR_ONCE_LAST_TEXEL;
-       case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-               return V_008F30_SQ_TEX_MIRROR_ONCE_BORDER;
-       }
+   switch (wrap) {
+   default:
+   case PIPE_TEX_WRAP_REPEAT:
+      return V_008F30_SQ_TEX_WRAP;
+   case PIPE_TEX_WRAP_CLAMP:
+      return V_008F30_SQ_TEX_CLAMP_HALF_BORDER;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      return V_008F30_SQ_TEX_CLAMP_LAST_TEXEL;
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      return V_008F30_SQ_TEX_CLAMP_BORDER;
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+      return V_008F30_SQ_TEX_MIRROR;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+      return V_008F30_SQ_TEX_MIRROR_ONCE_HALF_BORDER;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+      return V_008F30_SQ_TEX_MIRROR_ONCE_LAST_TEXEL;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      return V_008F30_SQ_TEX_MIRROR_ONCE_BORDER;
+   }
 }
 
 static unsigned si_tex_mipfilter(unsigned filter)
 {
-       switch (filter) {
-       case PIPE_TEX_MIPFILTER_NEAREST:
-               return V_008F38_SQ_TEX_Z_FILTER_POINT;
-       case PIPE_TEX_MIPFILTER_LINEAR:
-               return V_008F38_SQ_TEX_Z_FILTER_LINEAR;
-       default:
-       case PIPE_TEX_MIPFILTER_NONE:
-               return V_008F38_SQ_TEX_Z_FILTER_NONE;
-       }
+   switch (filter) {
+   case PIPE_TEX_MIPFILTER_NEAREST:
+      return V_008F38_SQ_TEX_Z_FILTER_POINT;
+   case PIPE_TEX_MIPFILTER_LINEAR:
+      return V_008F38_SQ_TEX_Z_FILTER_LINEAR;
+   default:
+   case PIPE_TEX_MIPFILTER_NONE:
+      return V_008F38_SQ_TEX_Z_FILTER_NONE;
+   }
 }
 
 static unsigned si_tex_compare(unsigned compare)
 {
-       switch (compare) {
-       default:
-       case PIPE_FUNC_NEVER:
-               return V_008F30_SQ_TEX_DEPTH_COMPARE_NEVER;
-       case PIPE_FUNC_LESS:
-               return V_008F30_SQ_TEX_DEPTH_COMPARE_LESS;
-       case PIPE_FUNC_EQUAL:
-               return V_008F30_SQ_TEX_DEPTH_COMPARE_EQUAL;
-       case PIPE_FUNC_LEQUAL:
-               return V_008F30_SQ_TEX_DEPTH_COMPARE_LESSEQUAL;
-       case PIPE_FUNC_GREATER:
-               return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATER;
-       case PIPE_FUNC_NOTEQUAL:
-               return V_008F30_SQ_TEX_DEPTH_COMPARE_NOTEQUAL;
-       case PIPE_FUNC_GEQUAL:
-               return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATEREQUAL;
-       case PIPE_FUNC_ALWAYS:
-               return V_008F30_SQ_TEX_DEPTH_COMPARE_ALWAYS;
-       }
+   switch (compare) {
+   default:
+   case PIPE_FUNC_NEVER:
+      return V_008F30_SQ_TEX_DEPTH_COMPARE_NEVER;
+   case PIPE_FUNC_LESS:
+      return V_008F30_SQ_TEX_DEPTH_COMPARE_LESS;
+   case PIPE_FUNC_EQUAL:
+      return V_008F30_SQ_TEX_DEPTH_COMPARE_EQUAL;
+   case PIPE_FUNC_LEQUAL:
+      return V_008F30_SQ_TEX_DEPTH_COMPARE_LESSEQUAL;
+   case PIPE_FUNC_GREATER:
+      return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATER;
+   case PIPE_FUNC_NOTEQUAL:
+      return V_008F30_SQ_TEX_DEPTH_COMPARE_NOTEQUAL;
+   case PIPE_FUNC_GEQUAL:
+      return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATEREQUAL;
+   case PIPE_FUNC_ALWAYS:
+      return V_008F30_SQ_TEX_DEPTH_COMPARE_ALWAYS;
+   }
 }
 
-static unsigned si_tex_dim(struct si_screen *sscreen, struct si_texture *tex,
-                          unsigned view_target, unsigned nr_samples)
+static unsigned si_tex_dim(struct si_screen *sscreen, struct si_texture *tex, unsigned view_target,
+                           unsigned nr_samples)
 {
-       unsigned res_target = tex->buffer.b.b.target;
-
-       if (view_target == PIPE_TEXTURE_CUBE ||
-           view_target == PIPE_TEXTURE_CUBE_ARRAY)
-               res_target = view_target;
-       /* If interpreting cubemaps as something else, set 2D_ARRAY. */
-       else if (res_target == PIPE_TEXTURE_CUBE ||
-                res_target == PIPE_TEXTURE_CUBE_ARRAY)
-               res_target = PIPE_TEXTURE_2D_ARRAY;
-
-       /* GFX9 allocates 1D textures as 2D. */
-       if ((res_target == PIPE_TEXTURE_1D ||
-            res_target == PIPE_TEXTURE_1D_ARRAY) &&
-           sscreen->info.chip_class == GFX9 &&
-           tex->surface.u.gfx9.resource_type == RADEON_RESOURCE_2D) {
-               if (res_target == PIPE_TEXTURE_1D)
-                       res_target = PIPE_TEXTURE_2D;
-               else
-                       res_target = PIPE_TEXTURE_2D_ARRAY;
-       }
-
-       switch (res_target) {
-       default:
-       case PIPE_TEXTURE_1D:
-               return V_008F1C_SQ_RSRC_IMG_1D;
-       case PIPE_TEXTURE_1D_ARRAY:
-               return V_008F1C_SQ_RSRC_IMG_1D_ARRAY;
-       case PIPE_TEXTURE_2D:
-       case PIPE_TEXTURE_RECT:
-               return nr_samples > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA :
-                                       V_008F1C_SQ_RSRC_IMG_2D;
-       case PIPE_TEXTURE_2D_ARRAY:
-               return nr_samples > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY :
-                                       V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
-       case PIPE_TEXTURE_3D:
-               return V_008F1C_SQ_RSRC_IMG_3D;
-       case PIPE_TEXTURE_CUBE:
-       case PIPE_TEXTURE_CUBE_ARRAY:
-               return V_008F1C_SQ_RSRC_IMG_CUBE;
-       }
+   unsigned res_target = tex->buffer.b.b.target;
+
+   if (view_target == PIPE_TEXTURE_CUBE || view_target == PIPE_TEXTURE_CUBE_ARRAY)
+      res_target = view_target;
+   /* If interpreting cubemaps as something else, set 2D_ARRAY. */
+   else if (res_target == PIPE_TEXTURE_CUBE || res_target == PIPE_TEXTURE_CUBE_ARRAY)
+      res_target = PIPE_TEXTURE_2D_ARRAY;
+
+   /* GFX9 allocates 1D textures as 2D. */
+   if ((res_target == PIPE_TEXTURE_1D || res_target == PIPE_TEXTURE_1D_ARRAY) &&
+       sscreen->info.chip_class == GFX9 &&
+       tex->surface.u.gfx9.resource_type == RADEON_RESOURCE_2D) {
+      if (res_target == PIPE_TEXTURE_1D)
+         res_target = PIPE_TEXTURE_2D;
+      else
+         res_target = PIPE_TEXTURE_2D_ARRAY;
+   }
+
+   switch (res_target) {
+   default:
+   case PIPE_TEXTURE_1D:
+      return V_008F1C_SQ_RSRC_IMG_1D;
+   case PIPE_TEXTURE_1D_ARRAY:
+      return V_008F1C_SQ_RSRC_IMG_1D_ARRAY;
+   case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_RECT:
+      return nr_samples > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA : V_008F1C_SQ_RSRC_IMG_2D;
+   case PIPE_TEXTURE_2D_ARRAY:
+      return nr_samples > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY : V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
+   case PIPE_TEXTURE_3D:
+      return V_008F1C_SQ_RSRC_IMG_3D;
+   case PIPE_TEXTURE_CUBE:
+   case PIPE_TEXTURE_CUBE_ARRAY:
+      return V_008F1C_SQ_RSRC_IMG_CUBE;
+   }
 }
 
 /*
@@ -2034,1748 +1919,1663 @@ static unsigned si_tex_dim(struct si_screen *sscreen, struct si_texture *tex,
 
 static bool si_is_sampler_format_supported(struct pipe_screen *screen, enum pipe_format format)
 {
-       struct si_screen *sscreen = (struct si_screen *)screen;
+   struct si_screen *sscreen = (struct si_screen *)screen;
 
-       if (sscreen->info.chip_class >= GFX10) {
-               const struct gfx10_format *fmt = &gfx10_format_table[format];
-               if (!fmt->img_format || fmt->buffers_only)
-                       return false;
-               return true;
-       }
+   if (sscreen->info.chip_class >= GFX10) {
+      const struct gfx10_format *fmt = &gfx10_format_table[format];
+      if (!fmt->img_format || fmt->buffers_only)
+         return false;
+      return true;
+   }
 
-       const struct util_format_description *desc = util_format_description(format);
-       if (!desc)
-               return false;
+   const struct util_format_description *desc = util_format_description(format);
+   if (!desc)
+      return false;
 
-       return si_translate_texformat(screen, format, desc,
-                                     util_format_get_first_non_void_channel(format)) != ~0U;
+   return si_translate_texformat(screen, format, desc,
+                                 util_format_get_first_non_void_channel(format)) != ~0U;
 }
 
 static uint32_t si_translate_buffer_dataformat(struct pipe_screen *screen,
-                                              const struct util_format_description *desc,
-                                              int first_non_void)
+                                               const struct util_format_description *desc,
+                                               int first_non_void)
 {
-       int i;
-
-       assert(((struct si_screen *)screen)->info.chip_class <= GFX9);
-
-       if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT)
-               return V_008F0C_BUF_DATA_FORMAT_10_11_11;
-
-       assert(first_non_void >= 0);
-
-       if (desc->nr_channels == 4 &&
-           desc->channel[0].size == 10 &&
-           desc->channel[1].size == 10 &&
-           desc->channel[2].size == 10 &&
-           desc->channel[3].size == 2)
-               return V_008F0C_BUF_DATA_FORMAT_2_10_10_10;
-
-       /* See whether the components are of the same size. */
-       for (i = 0; i < desc->nr_channels; i++) {
-               if (desc->channel[first_non_void].size != desc->channel[i].size)
-                       return V_008F0C_BUF_DATA_FORMAT_INVALID;
-       }
-
-       switch (desc->channel[first_non_void].size) {
-       case 8:
-               switch (desc->nr_channels) {
-               case 1:
-               case 3: /* 3 loads */
-                       return V_008F0C_BUF_DATA_FORMAT_8;
-               case 2:
-                       return V_008F0C_BUF_DATA_FORMAT_8_8;
-               case 4:
-                       return V_008F0C_BUF_DATA_FORMAT_8_8_8_8;
-               }
-               break;
-       case 16:
-               switch (desc->nr_channels) {
-               case 1:
-               case 3: /* 3 loads */
-                       return V_008F0C_BUF_DATA_FORMAT_16;
-               case 2:
-                       return V_008F0C_BUF_DATA_FORMAT_16_16;
-               case 4:
-                       return V_008F0C_BUF_DATA_FORMAT_16_16_16_16;
-               }
-               break;
-       case 32:
-               switch (desc->nr_channels) {
-               case 1:
-                       return V_008F0C_BUF_DATA_FORMAT_32;
-               case 2:
-                       return V_008F0C_BUF_DATA_FORMAT_32_32;
-               case 3:
-                       return V_008F0C_BUF_DATA_FORMAT_32_32_32;
-               case 4:
-                       return V_008F0C_BUF_DATA_FORMAT_32_32_32_32;
-               }
-               break;
-       case 64:
-               /* Legacy double formats. */
-               switch (desc->nr_channels) {
-               case 1: /* 1 load */
-                       return V_008F0C_BUF_DATA_FORMAT_32_32;
-               case 2: /* 1 load */
-                       return V_008F0C_BUF_DATA_FORMAT_32_32_32_32;
-               case 3: /* 3 loads */
-                       return V_008F0C_BUF_DATA_FORMAT_32_32;
-               case 4: /* 2 loads */
-                       return V_008F0C_BUF_DATA_FORMAT_32_32_32_32;
-               }
-               break;
-       }
-
-       return V_008F0C_BUF_DATA_FORMAT_INVALID;
+   int i;
+
+   assert(((struct si_screen *)screen)->info.chip_class <= GFX9);
+
+   if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT)
+      return V_008F0C_BUF_DATA_FORMAT_10_11_11;
+
+   assert(first_non_void >= 0);
+
+   if (desc->nr_channels == 4 && desc->channel[0].size == 10 && desc->channel[1].size == 10 &&
+       desc->channel[2].size == 10 && desc->channel[3].size == 2)
+      return V_008F0C_BUF_DATA_FORMAT_2_10_10_10;
+
+   /* See whether the components are of the same size. */
+   for (i = 0; i < desc->nr_channels; i++) {
+      if (desc->channel[first_non_void].size != desc->channel[i].size)
+         return V_008F0C_BUF_DATA_FORMAT_INVALID;
+   }
+
+   switch (desc->channel[first_non_void].size) {
+   case 8:
+      switch (desc->nr_channels) {
+      case 1:
+      case 3: /* 3 loads */
+         return V_008F0C_BUF_DATA_FORMAT_8;
+      case 2:
+         return V_008F0C_BUF_DATA_FORMAT_8_8;
+      case 4:
+         return V_008F0C_BUF_DATA_FORMAT_8_8_8_8;
+      }
+      break;
+   case 16:
+      switch (desc->nr_channels) {
+      case 1:
+      case 3: /* 3 loads */
+         return V_008F0C_BUF_DATA_FORMAT_16;
+      case 2:
+         return V_008F0C_BUF_DATA_FORMAT_16_16;
+      case 4:
+         return V_008F0C_BUF_DATA_FORMAT_16_16_16_16;
+      }
+      break;
+   case 32:
+      switch (desc->nr_channels) {
+      case 1:
+         return V_008F0C_BUF_DATA_FORMAT_32;
+      case 2:
+         return V_008F0C_BUF_DATA_FORMAT_32_32;
+      case 3:
+         return V_008F0C_BUF_DATA_FORMAT_32_32_32;
+      case 4:
+         return V_008F0C_BUF_DATA_FORMAT_32_32_32_32;
+      }
+      break;
+   case 64:
+      /* Legacy double formats. */
+      switch (desc->nr_channels) {
+      case 1: /* 1 load */
+         return V_008F0C_BUF_DATA_FORMAT_32_32;
+      case 2: /* 1 load */
+         return V_008F0C_BUF_DATA_FORMAT_32_32_32_32;
+      case 3: /* 3 loads */
+         return V_008F0C_BUF_DATA_FORMAT_32_32;
+      case 4: /* 2 loads */
+         return V_008F0C_BUF_DATA_FORMAT_32_32_32_32;
+      }
+      break;
+   }
+
+   return V_008F0C_BUF_DATA_FORMAT_INVALID;
 }
 
 static uint32_t si_translate_buffer_numformat(struct pipe_screen *screen,
-                                             const struct util_format_description *desc,
-                                             int first_non_void)
+                                              const struct util_format_description *desc,
+                                              int first_non_void)
 {
-       assert(((struct si_screen *)screen)->info.chip_class <= GFX9);
-
-       if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT)
-               return V_008F0C_BUF_NUM_FORMAT_FLOAT;
-
-       assert(first_non_void >= 0);
-
-       switch (desc->channel[first_non_void].type) {
-       case UTIL_FORMAT_TYPE_SIGNED:
-       case UTIL_FORMAT_TYPE_FIXED:
-               if (desc->channel[first_non_void].size >= 32 ||
-                   desc->channel[first_non_void].pure_integer)
-                       return V_008F0C_BUF_NUM_FORMAT_SINT;
-               else if (desc->channel[first_non_void].normalized)
-                       return V_008F0C_BUF_NUM_FORMAT_SNORM;
-               else
-                       return V_008F0C_BUF_NUM_FORMAT_SSCALED;
-               break;
-       case UTIL_FORMAT_TYPE_UNSIGNED:
-               if (desc->channel[first_non_void].size >= 32 ||
-                   desc->channel[first_non_void].pure_integer)
-                       return V_008F0C_BUF_NUM_FORMAT_UINT;
-               else if (desc->channel[first_non_void].normalized)
-                       return V_008F0C_BUF_NUM_FORMAT_UNORM;
-               else
-                       return V_008F0C_BUF_NUM_FORMAT_USCALED;
-               break;
-       case UTIL_FORMAT_TYPE_FLOAT:
-       default:
-               return V_008F0C_BUF_NUM_FORMAT_FLOAT;
-       }
+   assert(((struct si_screen *)screen)->info.chip_class <= GFX9);
+
+   if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT)
+      return V_008F0C_BUF_NUM_FORMAT_FLOAT;
+
+   assert(first_non_void >= 0);
+
+   switch (desc->channel[first_non_void].type) {
+   case UTIL_FORMAT_TYPE_SIGNED:
+   case UTIL_FORMAT_TYPE_FIXED:
+      if (desc->channel[first_non_void].size >= 32 || desc->channel[first_non_void].pure_integer)
+         return V_008F0C_BUF_NUM_FORMAT_SINT;
+      else if (desc->channel[first_non_void].normalized)
+         return V_008F0C_BUF_NUM_FORMAT_SNORM;
+      else
+         return V_008F0C_BUF_NUM_FORMAT_SSCALED;
+      break;
+   case UTIL_FORMAT_TYPE_UNSIGNED:
+      if (desc->channel[first_non_void].size >= 32 || desc->channel[first_non_void].pure_integer)
+         return V_008F0C_BUF_NUM_FORMAT_UINT;
+      else if (desc->channel[first_non_void].normalized)
+         return V_008F0C_BUF_NUM_FORMAT_UNORM;
+      else
+         return V_008F0C_BUF_NUM_FORMAT_USCALED;
+      break;
+   case UTIL_FORMAT_TYPE_FLOAT:
+   default:
+      return V_008F0C_BUF_NUM_FORMAT_FLOAT;
+   }
 }
 
-static unsigned si_is_vertex_format_supported(struct pipe_screen *screen,
-                                             enum pipe_format format,
-                                             unsigned usage)
+static unsigned si_is_vertex_format_supported(struct pipe_screen *screen, enum pipe_format format,
+                                              unsigned usage)
 {
-       struct si_screen *sscreen = (struct si_screen *)screen;
-       const struct util_format_description *desc;
-       int first_non_void;
-       unsigned data_format;
-
-       assert((usage & ~(PIPE_BIND_SHADER_IMAGE |
-                         PIPE_BIND_SAMPLER_VIEW |
-                         PIPE_BIND_VERTEX_BUFFER)) == 0);
-
-       desc = util_format_description(format);
-       if (!desc)
-               return 0;
-
-       /* There are no native 8_8_8 or 16_16_16 data formats, and we currently
-        * select 8_8_8_8 and 16_16_16_16 instead. This works reasonably well
-        * for read-only access (with caveats surrounding bounds checks), but
-        * obviously fails for write access which we have to implement for
-        * shader images. Luckily, OpenGL doesn't expect this to be supported
-        * anyway, and so the only impact is on PBO uploads / downloads, which
-        * shouldn't be expected to be fast for GL_RGB anyway.
-        */
-       if (desc->block.bits == 3 * 8 ||
-           desc->block.bits == 3 * 16) {
-               if (usage & (PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW)) {
-                   usage &= ~(PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW);
-                       if (!usage)
-                               return 0;
-               }
-       }
-
-       if (sscreen->info.chip_class >= GFX10) {
-               const struct gfx10_format *fmt = &gfx10_format_table[format];
-               if (!fmt->img_format || fmt->img_format >= 128)
-                       return 0;
-               return usage;
-       }
-
-       first_non_void = util_format_get_first_non_void_channel(format);
-       data_format = si_translate_buffer_dataformat(screen, desc, first_non_void);
-       if (data_format == V_008F0C_BUF_DATA_FORMAT_INVALID)
-               return 0;
-
-       return usage;
+   struct si_screen *sscreen = (struct si_screen *)screen;
+   const struct util_format_description *desc;
+   int first_non_void;
+   unsigned data_format;
+
+   assert((usage & ~(PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_VERTEX_BUFFER)) ==
+          0);
+
+   desc = util_format_description(format);
+   if (!desc)
+      return 0;
+
+   /* There are no native 8_8_8 or 16_16_16 data formats, and we currently
+    * select 8_8_8_8 and 16_16_16_16 instead. This works reasonably well
+    * for read-only access (with caveats surrounding bounds checks), but
+    * obviously fails for write access which we have to implement for
+    * shader images. Luckily, OpenGL doesn't expect this to be supported
+    * anyway, and so the only impact is on PBO uploads / downloads, which
+    * shouldn't be expected to be fast for GL_RGB anyway.
+    */
+   if (desc->block.bits == 3 * 8 || desc->block.bits == 3 * 16) {
+      if (usage & (PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW)) {
+         usage &= ~(PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW);
+         if (!usage)
+            return 0;
+      }
+   }
+
+   if (sscreen->info.chip_class >= GFX10) {
+      const struct gfx10_format *fmt = &gfx10_format_table[format];
+      if (!fmt->img_format || fmt->img_format >= 128)
+         return 0;
+      return usage;
+   }
+
+   first_non_void = util_format_get_first_non_void_channel(format);
+   data_format = si_translate_buffer_dataformat(screen, desc, first_non_void);
+   if (data_format == V_008F0C_BUF_DATA_FORMAT_INVALID)
+      return 0;
+
+   return usage;
 }
 
 static bool si_is_colorbuffer_format_supported(enum pipe_format format)
 {
-       return si_translate_colorformat(format) != V_028C70_COLOR_INVALID &&
-               si_translate_colorswap(format, false) != ~0U;
+   return si_translate_colorformat(format) != V_028C70_COLOR_INVALID &&
+          si_translate_colorswap(format, false) != ~0U;
 }
 
 static bool si_is_zs_format_supported(enum pipe_format format)
 {
-       return si_translate_dbformat(format) != V_028040_Z_INVALID;
+   return si_translate_dbformat(format) != V_028040_Z_INVALID;
 }
 
-static bool si_is_format_supported(struct pipe_screen *screen,
-                                  enum pipe_format format,
-                                  enum pipe_texture_target target,
-                                  unsigned sample_count,
-                                  unsigned storage_sample_count,
-                                  unsigned usage)
+static bool si_is_format_supported(struct pipe_screen *screen, enum pipe_format format,
+                                   enum pipe_texture_target target, unsigned sample_count,
+                                   unsigned storage_sample_count, unsigned usage)
 {
-       struct si_screen *sscreen = (struct si_screen *)screen;
-       unsigned retval = 0;
-
-       if (target >= PIPE_MAX_TEXTURE_TYPES) {
-               PRINT_ERR("radeonsi: unsupported texture type %d\n", target);
-               return false;
-       }
-
-       if (MAX2(1, sample_count) < MAX2(1, storage_sample_count))
-               return false;
-
-       if (sample_count > 1) {
-               if (!screen->get_param(screen, PIPE_CAP_TEXTURE_MULTISAMPLE))
-                       return false;
-
-               /* Only power-of-two sample counts are supported. */
-               if (!util_is_power_of_two_or_zero(sample_count) ||
-                   !util_is_power_of_two_or_zero(storage_sample_count))
-                       return false;
-
-               /* MSAA support without framebuffer attachments. */
-               if (format == PIPE_FORMAT_NONE && sample_count <= 16)
-                       return true;
-
-               if (!sscreen->info.has_eqaa_surface_allocator ||
-                   util_format_is_depth_or_stencil(format)) {
-                       /* Color without EQAA or depth/stencil. */
-                       if (sample_count > 8 ||
-                           sample_count != storage_sample_count)
-                               return false;
-               } else {
-                       /* Color with EQAA. */
-                       if (sample_count > 16 ||
-                           storage_sample_count > 8)
-                               return false;
-               }
-       }
-
-       if (usage & (PIPE_BIND_SAMPLER_VIEW |
-                    PIPE_BIND_SHADER_IMAGE)) {
-               if (target == PIPE_BUFFER) {
-                       retval |= si_is_vertex_format_supported(
-                               screen, format, usage & (PIPE_BIND_SAMPLER_VIEW |
-                                                        PIPE_BIND_SHADER_IMAGE));
-               } else {
-                       if (si_is_sampler_format_supported(screen, format))
-                               retval |= usage & (PIPE_BIND_SAMPLER_VIEW |
-                                                  PIPE_BIND_SHADER_IMAGE);
-               }
-       }
-
-       if ((usage & (PIPE_BIND_RENDER_TARGET |
-                     PIPE_BIND_DISPLAY_TARGET |
-                     PIPE_BIND_SCANOUT |
-                     PIPE_BIND_SHARED |
-                     PIPE_BIND_BLENDABLE)) &&
-           si_is_colorbuffer_format_supported(format)) {
-               retval |= usage &
-                         (PIPE_BIND_RENDER_TARGET |
-                          PIPE_BIND_DISPLAY_TARGET |
-                          PIPE_BIND_SCANOUT |
-                          PIPE_BIND_SHARED);
-               if (!util_format_is_pure_integer(format) &&
-                   !util_format_is_depth_or_stencil(format))
-                       retval |= usage & PIPE_BIND_BLENDABLE;
-       }
-
-       if ((usage & PIPE_BIND_DEPTH_STENCIL) &&
-           si_is_zs_format_supported(format)) {
-               retval |= PIPE_BIND_DEPTH_STENCIL;
-       }
-
-       if (usage & PIPE_BIND_VERTEX_BUFFER) {
-               retval |= si_is_vertex_format_supported(screen, format,
-                                                       PIPE_BIND_VERTEX_BUFFER);
-       }
-
-       if ((usage & PIPE_BIND_LINEAR) &&
-           !util_format_is_compressed(format) &&
-           !(usage & PIPE_BIND_DEPTH_STENCIL))
-               retval |= PIPE_BIND_LINEAR;
-
-       return retval == usage;
+   struct si_screen *sscreen = (struct si_screen *)screen;
+   unsigned retval = 0;
+
+   if (target >= PIPE_MAX_TEXTURE_TYPES) {
+      PRINT_ERR("radeonsi: unsupported texture type %d\n", target);
+      return false;
+   }
+
+   if (MAX2(1, sample_count) < MAX2(1, storage_sample_count))
+      return false;
+
+   if (sample_count > 1) {
+      if (!screen->get_param(screen, PIPE_CAP_TEXTURE_MULTISAMPLE))
+         return false;
+
+      /* Only power-of-two sample counts are supported. */
+      if (!util_is_power_of_two_or_zero(sample_count) ||
+          !util_is_power_of_two_or_zero(storage_sample_count))
+         return false;
+
+      /* MSAA support without framebuffer attachments. */
+      if (format == PIPE_FORMAT_NONE && sample_count <= 16)
+         return true;
+
+      if (!sscreen->info.has_eqaa_surface_allocator || util_format_is_depth_or_stencil(format)) {
+         /* Color without EQAA or depth/stencil. */
+         if (sample_count > 8 || sample_count != storage_sample_count)
+            return false;
+      } else {
+         /* Color with EQAA. */
+         if (sample_count > 16 || storage_sample_count > 8)
+            return false;
+      }
+   }
+
+   if (usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE)) {
+      if (target == PIPE_BUFFER) {
+         retval |= si_is_vertex_format_supported(
+            screen, format, usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE));
+      } else {
+         if (si_is_sampler_format_supported(screen, format))
+            retval |= usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE);
+      }
+   }
+
+   if ((usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT |
+                 PIPE_BIND_SHARED | PIPE_BIND_BLENDABLE)) &&
+       si_is_colorbuffer_format_supported(format)) {
+      retval |= usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT |
+                         PIPE_BIND_SHARED);
+      if (!util_format_is_pure_integer(format) && !util_format_is_depth_or_stencil(format))
+         retval |= usage & PIPE_BIND_BLENDABLE;
+   }
+
+   if ((usage & PIPE_BIND_DEPTH_STENCIL) && si_is_zs_format_supported(format)) {
+      retval |= PIPE_BIND_DEPTH_STENCIL;
+   }
+
+   if (usage & PIPE_BIND_VERTEX_BUFFER) {
+      retval |= si_is_vertex_format_supported(screen, format, PIPE_BIND_VERTEX_BUFFER);
+   }
+
+   if ((usage & PIPE_BIND_LINEAR) && !util_format_is_compressed(format) &&
+       !(usage & PIPE_BIND_DEPTH_STENCIL))
+      retval |= PIPE_BIND_LINEAR;
+
+   return retval == usage;
 }
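
For reference, the MSAA sample-count rules enforced in the format-support check above boil down to a few comparisons. The sketch below is illustrative only and not part of this commit; the helper and parameter names are invented, and has_eqaa stands in for the has_eqaa_surface_allocator capability.

#include <stdbool.h>

static bool is_pow2_or_zero(unsigned x)
{
   /* Same test util_is_power_of_two_or_zero() performs. */
   return (x & (x - 1)) == 0;
}

/* Mirrors the checks inside the "sample_count > 1" block above; the caller
 * has already rejected storage_samples > samples, and PIPE_FORMAT_NONE is
 * handled separately. */
static bool msaa_counts_ok(unsigned samples, unsigned storage_samples,
                           bool has_eqaa, bool is_depth_stencil)
{
   if (samples <= 1)
      return true;
   if (!is_pow2_or_zero(samples) || !is_pow2_or_zero(storage_samples))
      return false;
   if (!has_eqaa || is_depth_stencil)
      return samples <= 8 && samples == storage_samples; /* no EQAA */
   return samples <= 16 && storage_samples <= 8;         /* color with EQAA */
}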
 
 /*
  * framebuffer handling
  */
 
-static void si_choose_spi_color_formats(struct si_surface *surf,
-                                       unsigned format, unsigned swap,
-                                       unsigned ntype, bool is_depth)
+static void si_choose_spi_color_formats(struct si_surface *surf, unsigned format, unsigned swap,
+                                        unsigned ntype, bool is_depth)
 {
-       /* Alpha is needed for alpha-to-coverage.
-        * Blending may be with or without alpha.
-        */
-       unsigned normal = 0; /* most optimal, may not support blending or export alpha */
-       unsigned alpha = 0; /* exports alpha, but may not support blending */
-       unsigned blend = 0; /* supports blending, but may not export alpha */
-       unsigned blend_alpha = 0; /* least optimal, supports blending and exports alpha */
-
-       /* Choose the SPI color formats. These are required values for RB+.
-        * Other chips have multiple choices, though they are not necessarily better.
-        */
-       switch (format) {
-       case V_028C70_COLOR_5_6_5:
-       case V_028C70_COLOR_1_5_5_5:
-       case V_028C70_COLOR_5_5_5_1:
-       case V_028C70_COLOR_4_4_4_4:
-       case V_028C70_COLOR_10_11_11:
-       case V_028C70_COLOR_11_11_10:
-       case V_028C70_COLOR_8:
-       case V_028C70_COLOR_8_8:
-       case V_028C70_COLOR_8_8_8_8:
-       case V_028C70_COLOR_10_10_10_2:
-       case V_028C70_COLOR_2_10_10_10:
-               if (ntype == V_028C70_NUMBER_UINT)
-                       alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR;
-               else if (ntype == V_028C70_NUMBER_SINT)
-                       alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR;
-               else
-                       alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR;
-               break;
-
-       case V_028C70_COLOR_16:
-       case V_028C70_COLOR_16_16:
-       case V_028C70_COLOR_16_16_16_16:
-               if (ntype == V_028C70_NUMBER_UNORM ||
-                   ntype == V_028C70_NUMBER_SNORM) {
-                       /* UNORM16 and SNORM16 don't support blending */
-                       if (ntype == V_028C70_NUMBER_UNORM)
-                               normal = alpha = V_028714_SPI_SHADER_UNORM16_ABGR;
-                       else
-                               normal = alpha = V_028714_SPI_SHADER_SNORM16_ABGR;
-
-                       /* Use 32 bits per channel for blending. */
-                       if (format == V_028C70_COLOR_16) {
-                               if (swap == V_028C70_SWAP_STD) { /* R */
-                                       blend = V_028714_SPI_SHADER_32_R;
-                                       blend_alpha = V_028714_SPI_SHADER_32_AR;
-                               } else if (swap == V_028C70_SWAP_ALT_REV) /* A */
-                                       blend = blend_alpha = V_028714_SPI_SHADER_32_AR;
-                               else
-                                       assert(0);
-                       } else if (format == V_028C70_COLOR_16_16) {
-                               if (swap == V_028C70_SWAP_STD) { /* RG */
-                                       blend = V_028714_SPI_SHADER_32_GR;
-                                       blend_alpha = V_028714_SPI_SHADER_32_ABGR;
-                               } else if (swap == V_028C70_SWAP_ALT) /* RA */
-                                       blend = blend_alpha = V_028714_SPI_SHADER_32_AR;
-                               else
-                                       assert(0);
-                       } else /* 16_16_16_16 */
-                               blend = blend_alpha = V_028714_SPI_SHADER_32_ABGR;
-               } else if (ntype == V_028C70_NUMBER_UINT)
-                       alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR;
-               else if (ntype == V_028C70_NUMBER_SINT)
-                       alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR;
-               else if (ntype == V_028C70_NUMBER_FLOAT)
-                       alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR;
-               else
-                       assert(0);
-               break;
-
-       case V_028C70_COLOR_32:
-               if (swap == V_028C70_SWAP_STD) { /* R */
-                       blend = normal = V_028714_SPI_SHADER_32_R;
-                       alpha = blend_alpha = V_028714_SPI_SHADER_32_AR;
-               } else if (swap == V_028C70_SWAP_ALT_REV) /* A */
-                       alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR;
-               else
-                       assert(0);
-               break;
-
-       case V_028C70_COLOR_32_32:
-               if (swap == V_028C70_SWAP_STD) { /* RG */
-                       blend = normal = V_028714_SPI_SHADER_32_GR;
-                       alpha = blend_alpha = V_028714_SPI_SHADER_32_ABGR;
-               } else if (swap == V_028C70_SWAP_ALT) /* RA */
-                       alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR;
-               else
-                       assert(0);
-               break;
-
-       case V_028C70_COLOR_32_32_32_32:
-       case V_028C70_COLOR_8_24:
-       case V_028C70_COLOR_24_8:
-       case V_028C70_COLOR_X24_8_32_FLOAT:
-               alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR;
-               break;
-
-       default:
-               assert(0);
-               return;
-       }
-
-       /* The DB->CB copy needs 32_ABGR. */
-       if (is_depth)
-               alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR;
-
-       surf->spi_shader_col_format = normal;
-       surf->spi_shader_col_format_alpha = alpha;
-       surf->spi_shader_col_format_blend = blend;
-       surf->spi_shader_col_format_blend_alpha = blend_alpha;
+   /* Alpha is needed for alpha-to-coverage.
+    * Blending may be with or without alpha.
+    */
+   unsigned normal = 0;      /* most optimal, may not support blending or export alpha */
+   unsigned alpha = 0;       /* exports alpha, but may not support blending */
+   unsigned blend = 0;       /* supports blending, but may not export alpha */
+   unsigned blend_alpha = 0; /* least optimal, supports blending and exports alpha */
+
+   /* Choose the SPI color formats. These are required values for RB+.
+    * Other chips have multiple choices, though they are not necessarily better.
+    */
+   switch (format) {
+   case V_028C70_COLOR_5_6_5:
+   case V_028C70_COLOR_1_5_5_5:
+   case V_028C70_COLOR_5_5_5_1:
+   case V_028C70_COLOR_4_4_4_4:
+   case V_028C70_COLOR_10_11_11:
+   case V_028C70_COLOR_11_11_10:
+   case V_028C70_COLOR_8:
+   case V_028C70_COLOR_8_8:
+   case V_028C70_COLOR_8_8_8_8:
+   case V_028C70_COLOR_10_10_10_2:
+   case V_028C70_COLOR_2_10_10_10:
+      if (ntype == V_028C70_NUMBER_UINT)
+         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR;
+      else if (ntype == V_028C70_NUMBER_SINT)
+         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR;
+      else
+         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR;
+      break;
+
+   case V_028C70_COLOR_16:
+   case V_028C70_COLOR_16_16:
+   case V_028C70_COLOR_16_16_16_16:
+      if (ntype == V_028C70_NUMBER_UNORM || ntype == V_028C70_NUMBER_SNORM) {
+         /* UNORM16 and SNORM16 don't support blending */
+         if (ntype == V_028C70_NUMBER_UNORM)
+            normal = alpha = V_028714_SPI_SHADER_UNORM16_ABGR;
+         else
+            normal = alpha = V_028714_SPI_SHADER_SNORM16_ABGR;
+
+         /* Use 32 bits per channel for blending. */
+         if (format == V_028C70_COLOR_16) {
+            if (swap == V_028C70_SWAP_STD) { /* R */
+               blend = V_028714_SPI_SHADER_32_R;
+               blend_alpha = V_028714_SPI_SHADER_32_AR;
+            } else if (swap == V_028C70_SWAP_ALT_REV) /* A */
+               blend = blend_alpha = V_028714_SPI_SHADER_32_AR;
+            else
+               assert(0);
+         } else if (format == V_028C70_COLOR_16_16) {
+            if (swap == V_028C70_SWAP_STD) { /* RG */
+               blend = V_028714_SPI_SHADER_32_GR;
+               blend_alpha = V_028714_SPI_SHADER_32_ABGR;
+            } else if (swap == V_028C70_SWAP_ALT) /* RA */
+               blend = blend_alpha = V_028714_SPI_SHADER_32_AR;
+            else
+               assert(0);
+         } else /* 16_16_16_16 */
+            blend = blend_alpha = V_028714_SPI_SHADER_32_ABGR;
+      } else if (ntype == V_028C70_NUMBER_UINT)
+         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR;
+      else if (ntype == V_028C70_NUMBER_SINT)
+         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR;
+      else if (ntype == V_028C70_NUMBER_FLOAT)
+         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR;
+      else
+         assert(0);
+      break;
+
+   case V_028C70_COLOR_32:
+      if (swap == V_028C70_SWAP_STD) { /* R */
+         blend = normal = V_028714_SPI_SHADER_32_R;
+         alpha = blend_alpha = V_028714_SPI_SHADER_32_AR;
+      } else if (swap == V_028C70_SWAP_ALT_REV) /* A */
+         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR;
+      else
+         assert(0);
+      break;
+
+   case V_028C70_COLOR_32_32:
+      if (swap == V_028C70_SWAP_STD) { /* RG */
+         blend = normal = V_028714_SPI_SHADER_32_GR;
+         alpha = blend_alpha = V_028714_SPI_SHADER_32_ABGR;
+      } else if (swap == V_028C70_SWAP_ALT) /* RA */
+         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR;
+      else
+         assert(0);
+      break;
+
+   case V_028C70_COLOR_32_32_32_32:
+   case V_028C70_COLOR_8_24:
+   case V_028C70_COLOR_24_8:
+   case V_028C70_COLOR_X24_8_32_FLOAT:
+      alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR;
+      break;
+
+   default:
+      assert(0);
+      return;
+   }
+
+   /* The DB->CB copy needs 32_ABGR. */
+   if (is_depth)
+      alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR;
+
+   surf->spi_shader_col_format = normal;
+   surf->spi_shader_col_format_alpha = alpha;
+   surf->spi_shader_col_format_blend = blend;
+   surf->spi_shader_col_format_blend_alpha = blend_alpha;
 }
 
-static void si_initialize_color_surface(struct si_context *sctx,
-                                       struct si_surface *surf)
+static void si_initialize_color_surface(struct si_context *sctx, struct si_surface *surf)
 {
-       struct si_texture *tex = (struct si_texture*)surf->base.texture;
-       unsigned color_info, color_attrib;
-       unsigned format, swap, ntype, endian;
-       const struct util_format_description *desc;
-       int firstchan;
-       unsigned blend_clamp = 0, blend_bypass = 0;
-
-       desc = util_format_description(surf->base.format);
-       for (firstchan = 0; firstchan < 4; firstchan++) {
-               if (desc->channel[firstchan].type != UTIL_FORMAT_TYPE_VOID) {
-                       break;
-               }
-       }
-       if (firstchan == 4 || desc->channel[firstchan].type == UTIL_FORMAT_TYPE_FLOAT) {
-               ntype = V_028C70_NUMBER_FLOAT;
-       } else {
-               ntype = V_028C70_NUMBER_UNORM;
-               if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB)
-                       ntype = V_028C70_NUMBER_SRGB;
-               else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_SIGNED) {
-                       if (desc->channel[firstchan].pure_integer) {
-                               ntype = V_028C70_NUMBER_SINT;
-                       } else {
-                               assert(desc->channel[firstchan].normalized);
-                               ntype = V_028C70_NUMBER_SNORM;
-                       }
-               } else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_UNSIGNED) {
-                       if (desc->channel[firstchan].pure_integer) {
-                               ntype = V_028C70_NUMBER_UINT;
-                       } else {
-                               assert(desc->channel[firstchan].normalized);
-                               ntype = V_028C70_NUMBER_UNORM;
-                       }
-               }
-       }
-
-       format = si_translate_colorformat(surf->base.format);
-       if (format == V_028C70_COLOR_INVALID) {
-               PRINT_ERR("Invalid CB format: %d, disabling CB.\n", surf->base.format);
-       }
-       assert(format != V_028C70_COLOR_INVALID);
-       swap = si_translate_colorswap(surf->base.format, false);
-       endian = si_colorformat_endian_swap(format);
-
-       /* blend clamp should be set for all NORM/SRGB types */
-       if (ntype == V_028C70_NUMBER_UNORM ||
-           ntype == V_028C70_NUMBER_SNORM ||
-           ntype == V_028C70_NUMBER_SRGB)
-               blend_clamp = 1;
-
-       /* set blend bypass according to docs if SINT/UINT or
-          8/24 COLOR variants */
-       if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT ||
-           format == V_028C70_COLOR_8_24 || format == V_028C70_COLOR_24_8 ||
-           format == V_028C70_COLOR_X24_8_32_FLOAT) {
-               blend_clamp = 0;
-               blend_bypass = 1;
-       }
-
-       if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT) {
-               if (format == V_028C70_COLOR_8 ||
-                   format == V_028C70_COLOR_8_8 ||
-                   format == V_028C70_COLOR_8_8_8_8)
-                       surf->color_is_int8 = true;
-               else if (format == V_028C70_COLOR_10_10_10_2 ||
-                        format == V_028C70_COLOR_2_10_10_10)
-                       surf->color_is_int10 = true;
-       }
-
-       color_info = S_028C70_FORMAT(format) |
-               S_028C70_COMP_SWAP(swap) |
-               S_028C70_BLEND_CLAMP(blend_clamp) |
-               S_028C70_BLEND_BYPASS(blend_bypass) |
-               S_028C70_SIMPLE_FLOAT(1) |
-               S_028C70_ROUND_MODE(ntype != V_028C70_NUMBER_UNORM &&
-                                   ntype != V_028C70_NUMBER_SNORM &&
-                                   ntype != V_028C70_NUMBER_SRGB &&
-                                   format != V_028C70_COLOR_8_24 &&
-                                   format != V_028C70_COLOR_24_8) |
-               S_028C70_NUMBER_TYPE(ntype) |
-               S_028C70_ENDIAN(endian);
-
-       /* Intensity is implemented as Red, so treat it that way. */
-       color_attrib = S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == PIPE_SWIZZLE_1 ||
-                                                 util_format_is_intensity(surf->base.format));
-
-       if (tex->buffer.b.b.nr_samples > 1) {
-               unsigned log_samples = util_logbase2(tex->buffer.b.b.nr_samples);
-               unsigned log_fragments = util_logbase2(tex->buffer.b.b.nr_storage_samples);
-
-               color_attrib |= S_028C74_NUM_SAMPLES(log_samples) |
-                               S_028C74_NUM_FRAGMENTS(log_fragments);
-
-               if (tex->surface.fmask_offset) {
-                       color_info |= S_028C70_COMPRESSION(1);
-                       unsigned fmask_bankh = util_logbase2(tex->surface.u.legacy.fmask.bankh);
-
-                       if (sctx->chip_class == GFX6) {
-                               /* due to a hw bug, FMASK_BANK_HEIGHT must be set on GFX6 too */
-                               color_attrib |= S_028C74_FMASK_BANK_HEIGHT(fmask_bankh);
-                       }
-               }
-       }
-
-       if (sctx->chip_class >= GFX10) {
-               unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B;
-
-               /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and
-                  64 for APU because all of our APUs to date use DIMMs which have
-                  a request granularity size of 64B while all other chips have a
-                  32B request size */
-               if (!sctx->screen->info.has_dedicated_vram)
-                       min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B;
-
-               surf->cb_dcc_control =
-                       S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) |
-                       S_028C78_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) |
-                       S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) |
-                       S_028C78_INDEPENDENT_64B_BLOCKS(0) |
-                       S_028C78_INDEPENDENT_128B_BLOCKS(1);
-       } else if (sctx->chip_class >= GFX8) {
-               unsigned max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_256B;
-               unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B;
-
-               /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and
-                  64 for APU because all of our APUs to date use DIMMs which have
-                  a request granularity size of 64B while all other chips have a
-                  32B request size */
-               if (!sctx->screen->info.has_dedicated_vram)
-                       min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B;
-
-               if (tex->buffer.b.b.nr_storage_samples > 1) {
-                       if (tex->surface.bpe == 1)
-                               max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B;
-                       else if (tex->surface.bpe == 2)
-                               max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B;
-               }
-
-               surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(max_uncompressed_block_size) |
-                                      S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) |
-                                      S_028C78_INDEPENDENT_64B_BLOCKS(1);
-       }
-
-       /* This must be set for fast clear to work without FMASK. */
-       if (!tex->surface.fmask_size && sctx->chip_class == GFX6) {
-               unsigned bankh = util_logbase2(tex->surface.u.legacy.bankh);
-               color_attrib |= S_028C74_FMASK_BANK_HEIGHT(bankh);
-       }
-
-       /* GFX10 field has the same base shift as the GFX6 field */
-       unsigned color_view = S_028C6C_SLICE_START(surf->base.u.tex.first_layer) |
-                             S_028C6C_SLICE_MAX_GFX10(surf->base.u.tex.last_layer);
-       unsigned mip0_depth = util_max_layer(&tex->buffer.b.b, 0);
-
-       if (sctx->chip_class >= GFX10) {
-               color_view |= S_028C6C_MIP_LEVEL_GFX10(surf->base.u.tex.level);
-
-               surf->cb_color_attrib3 = S_028EE0_MIP0_DEPTH(mip0_depth) |
-                                        S_028EE0_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type) |
-                                        S_028EE0_RESOURCE_LEVEL(1);
-       } else if (sctx->chip_class == GFX9) {
-               color_view |= S_028C6C_MIP_LEVEL_GFX9(surf->base.u.tex.level);
-               color_attrib |= S_028C74_MIP0_DEPTH(mip0_depth) |
-                               S_028C74_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type);
-       }
-
-       if (sctx->chip_class >= GFX9) {
-               surf->cb_color_attrib2 = S_028C68_MIP0_WIDTH(surf->width0 - 1) |
-                                        S_028C68_MIP0_HEIGHT(surf->height0 - 1) |
-                                        S_028C68_MAX_MIP(tex->buffer.b.b.last_level);
-       }
-
-       surf->cb_color_view = color_view;
-       surf->cb_color_info = color_info;
-       surf->cb_color_attrib = color_attrib;
-
-       /* Determine pixel shader export format */
-       si_choose_spi_color_formats(surf, format, swap, ntype, tex->is_depth);
-
-       surf->color_initialized = true;
+   struct si_texture *tex = (struct si_texture *)surf->base.texture;
+   unsigned color_info, color_attrib;
+   unsigned format, swap, ntype, endian;
+   const struct util_format_description *desc;
+   int firstchan;
+   unsigned blend_clamp = 0, blend_bypass = 0;
+
+   desc = util_format_description(surf->base.format);
+   for (firstchan = 0; firstchan < 4; firstchan++) {
+      if (desc->channel[firstchan].type != UTIL_FORMAT_TYPE_VOID) {
+         break;
+      }
+   }
+   if (firstchan == 4 || desc->channel[firstchan].type == UTIL_FORMAT_TYPE_FLOAT) {
+      ntype = V_028C70_NUMBER_FLOAT;
+   } else {
+      ntype = V_028C70_NUMBER_UNORM;
+      if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB)
+         ntype = V_028C70_NUMBER_SRGB;
+      else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_SIGNED) {
+         if (desc->channel[firstchan].pure_integer) {
+            ntype = V_028C70_NUMBER_SINT;
+         } else {
+            assert(desc->channel[firstchan].normalized);
+            ntype = V_028C70_NUMBER_SNORM;
+         }
+      } else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_UNSIGNED) {
+         if (desc->channel[firstchan].pure_integer) {
+            ntype = V_028C70_NUMBER_UINT;
+         } else {
+            assert(desc->channel[firstchan].normalized);
+            ntype = V_028C70_NUMBER_UNORM;
+         }
+      }
+   }
+
+   format = si_translate_colorformat(surf->base.format);
+   if (format == V_028C70_COLOR_INVALID) {
+      PRINT_ERR("Invalid CB format: %d, disabling CB.\n", surf->base.format);
+   }
+   assert(format != V_028C70_COLOR_INVALID);
+   swap = si_translate_colorswap(surf->base.format, false);
+   endian = si_colorformat_endian_swap(format);
+
+   /* blend clamp should be set for all NORM/SRGB types */
+   if (ntype == V_028C70_NUMBER_UNORM || ntype == V_028C70_NUMBER_SNORM ||
+       ntype == V_028C70_NUMBER_SRGB)
+      blend_clamp = 1;
+
+   /* set blend bypass according to docs if SINT/UINT or
+      8/24 COLOR variants */
+   if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT ||
+       format == V_028C70_COLOR_8_24 || format == V_028C70_COLOR_24_8 ||
+       format == V_028C70_COLOR_X24_8_32_FLOAT) {
+      blend_clamp = 0;
+      blend_bypass = 1;
+   }
+
+   if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT) {
+      if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_8_8 ||
+          format == V_028C70_COLOR_8_8_8_8)
+         surf->color_is_int8 = true;
+      else if (format == V_028C70_COLOR_10_10_10_2 || format == V_028C70_COLOR_2_10_10_10)
+         surf->color_is_int10 = true;
+   }
+
+   color_info =
+      S_028C70_FORMAT(format) | S_028C70_COMP_SWAP(swap) | S_028C70_BLEND_CLAMP(blend_clamp) |
+      S_028C70_BLEND_BYPASS(blend_bypass) | S_028C70_SIMPLE_FLOAT(1) |
+      S_028C70_ROUND_MODE(ntype != V_028C70_NUMBER_UNORM && ntype != V_028C70_NUMBER_SNORM &&
+                          ntype != V_028C70_NUMBER_SRGB && format != V_028C70_COLOR_8_24 &&
+                          format != V_028C70_COLOR_24_8) |
+      S_028C70_NUMBER_TYPE(ntype) | S_028C70_ENDIAN(endian);
+
+   /* Intensity is implemented as Red, so treat it that way. */
+   color_attrib = S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == PIPE_SWIZZLE_1 ||
+                                             util_format_is_intensity(surf->base.format));
+
+   if (tex->buffer.b.b.nr_samples > 1) {
+      unsigned log_samples = util_logbase2(tex->buffer.b.b.nr_samples);
+      unsigned log_fragments = util_logbase2(tex->buffer.b.b.nr_storage_samples);
+
+      color_attrib |= S_028C74_NUM_SAMPLES(log_samples) | S_028C74_NUM_FRAGMENTS(log_fragments);
+
+      if (tex->surface.fmask_offset) {
+         color_info |= S_028C70_COMPRESSION(1);
+         unsigned fmask_bankh = util_logbase2(tex->surface.u.legacy.fmask.bankh);
+
+         if (sctx->chip_class == GFX6) {
+            /* due to a hw bug, FMASK_BANK_HEIGHT must be set on GFX6 too */
+            color_attrib |= S_028C74_FMASK_BANK_HEIGHT(fmask_bankh);
+         }
+      }
+   }
+
+   if (sctx->chip_class >= GFX10) {
+      unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B;
+
+      /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and
+         64 for APU because all of our APUs to date use DIMMs which have
+         a request granularity size of 64B while all other chips have a
+         32B request size */
+      if (!sctx->screen->info.has_dedicated_vram)
+         min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B;
+
+      surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) |
+                             S_028C78_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) |
+                             S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) |
+                             S_028C78_INDEPENDENT_64B_BLOCKS(0) |
+                             S_028C78_INDEPENDENT_128B_BLOCKS(1);
+   } else if (sctx->chip_class >= GFX8) {
+      unsigned max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_256B;
+      unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B;
+
+      /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and
+         64 for APU because all of our APUs to date use DIMMs which have
+         a request granularity size of 64B while all other chips have a
+         32B request size */
+      if (!sctx->screen->info.has_dedicated_vram)
+         min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B;
+
+      if (tex->buffer.b.b.nr_storage_samples > 1) {
+         if (tex->surface.bpe == 1)
+            max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B;
+         else if (tex->surface.bpe == 2)
+            max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B;
+      }
+
+      surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(max_uncompressed_block_size) |
+                             S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) |
+                             S_028C78_INDEPENDENT_64B_BLOCKS(1);
+   }
+
+   /* This must be set for fast clear to work without FMASK. */
+   if (!tex->surface.fmask_size && sctx->chip_class == GFX6) {
+      unsigned bankh = util_logbase2(tex->surface.u.legacy.bankh);
+      color_attrib |= S_028C74_FMASK_BANK_HEIGHT(bankh);
+   }
+
+   /* GFX10 field has the same base shift as the GFX6 field */
+   unsigned color_view = S_028C6C_SLICE_START(surf->base.u.tex.first_layer) |
+                         S_028C6C_SLICE_MAX_GFX10(surf->base.u.tex.last_layer);
+   unsigned mip0_depth = util_max_layer(&tex->buffer.b.b, 0);
+
+   if (sctx->chip_class >= GFX10) {
+      color_view |= S_028C6C_MIP_LEVEL_GFX10(surf->base.u.tex.level);
+
+      surf->cb_color_attrib3 = S_028EE0_MIP0_DEPTH(mip0_depth) |
+                               S_028EE0_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type) |
+                               S_028EE0_RESOURCE_LEVEL(1);
+   } else if (sctx->chip_class == GFX9) {
+      color_view |= S_028C6C_MIP_LEVEL_GFX9(surf->base.u.tex.level);
+      color_attrib |= S_028C74_MIP0_DEPTH(mip0_depth) |
+                      S_028C74_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type);
+   }
+
+   if (sctx->chip_class >= GFX9) {
+      surf->cb_color_attrib2 = S_028C68_MIP0_WIDTH(surf->width0 - 1) |
+                               S_028C68_MIP0_HEIGHT(surf->height0 - 1) |
+                               S_028C68_MAX_MIP(tex->buffer.b.b.last_level);
+   }
+
+   surf->cb_color_view = color_view;
+   surf->cb_color_info = color_info;
+   surf->cb_color_attrib = color_attrib;
+
+   /* Determine pixel shader export format */
+   si_choose_spi_color_formats(surf, format, swap, ntype, tex->is_depth);
+
+   surf->color_initialized = true;
 }
 
-static void si_init_depth_surface(struct si_context *sctx,
-                                 struct si_surface *surf)
+static void si_init_depth_surface(struct si_context *sctx, struct si_surface *surf)
 {
-       struct si_texture *tex = (struct si_texture*)surf->base.texture;
-       unsigned level = surf->base.u.tex.level;
-       unsigned format, stencil_format;
-       uint32_t z_info, s_info;
-
-       format = si_translate_dbformat(tex->db_render_format);
-       stencil_format = tex->surface.has_stencil ?
-                                V_028044_STENCIL_8 : V_028044_STENCIL_INVALID;
-
-       assert(format != V_028040_Z_INVALID);
-       if (format == V_028040_Z_INVALID)
-               PRINT_ERR("Invalid DB format: %d, disabling DB.\n", tex->buffer.b.b.format);
-
-       surf->db_depth_view = S_028008_SLICE_START(surf->base.u.tex.first_layer) |
-                             S_028008_SLICE_MAX(surf->base.u.tex.last_layer);
-       surf->db_htile_data_base = 0;
-       surf->db_htile_surface = 0;
-
-       if (sctx->chip_class >= GFX10) {
-               surf->db_depth_view |= S_028008_SLICE_START_HI(surf->base.u.tex.first_layer >> 11) |
-                                      S_028008_SLICE_MAX_HI(surf->base.u.tex.last_layer >> 11);
-       }
-
-       if (sctx->chip_class >= GFX9) {
-               assert(tex->surface.u.gfx9.surf_offset == 0);
-               surf->db_depth_base = tex->buffer.gpu_address >> 8;
-               surf->db_stencil_base = (tex->buffer.gpu_address +
-                                        tex->surface.u.gfx9.stencil_offset) >> 8;
-               z_info = S_028038_FORMAT(format) |
-                        S_028038_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)) |
-                        S_028038_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) |
-                        S_028038_MAXMIP(tex->buffer.b.b.last_level);
-               s_info = S_02803C_FORMAT(stencil_format) |
-                        S_02803C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode);
-
-               if (sctx->chip_class == GFX9) {
-                       surf->db_z_info2 = S_028068_EPITCH(tex->surface.u.gfx9.surf.epitch);
-                       surf->db_stencil_info2 = S_02806C_EPITCH(tex->surface.u.gfx9.stencil.epitch);
-               }
-               surf->db_depth_view |= S_028008_MIPID(level);
-               surf->db_depth_size = S_02801C_X_MAX(tex->buffer.b.b.width0 - 1) |
-                                     S_02801C_Y_MAX(tex->buffer.b.b.height0 - 1);
-
-               if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) {
-                       z_info |= S_028038_TILE_SURFACE_ENABLE(1) |
-                                 S_028038_ALLOW_EXPCLEAR(1);
-
-                       if (tex->tc_compatible_htile) {
-                               unsigned max_zplanes = 4;
-
-                               if (tex->db_render_format == PIPE_FORMAT_Z16_UNORM &&
-                                   tex->buffer.b.b.nr_samples > 1)
-                                       max_zplanes = 2;
-
-                               z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes + 1);
-
-                               if (sctx->chip_class >= GFX10) {
-                                       z_info |= S_028040_ITERATE_FLUSH(1);
-                                       s_info |= S_028044_ITERATE_FLUSH(!tex->htile_stencil_disabled);
-                               } else {
-                                       z_info |= S_028038_ITERATE_FLUSH(1);
-                                       s_info |= S_02803C_ITERATE_FLUSH(1);
-                               }
-                       }
-
-                       if (tex->surface.has_stencil && !tex->htile_stencil_disabled) {
-                               /* Stencil buffer workaround ported from the GFX6-GFX8 code.
-                                * See that for explanation.
-                                */
-                               s_info |= S_02803C_ALLOW_EXPCLEAR(tex->buffer.b.b.nr_samples <= 1);
-                       } else {
-                               /* Use all HTILE for depth if there's no stencil. */
-                               s_info |= S_02803C_TILE_STENCIL_DISABLE(1);
-                       }
-
-                       surf->db_htile_data_base = (tex->buffer.gpu_address +
-                                                   tex->surface.htile_offset) >> 8;
-                       surf->db_htile_surface = S_028ABC_FULL_CACHE(1) |
-                                                S_028ABC_PIPE_ALIGNED(tex->surface.u.gfx9.htile.pipe_aligned);
-                       if (sctx->chip_class == GFX9) {
-                               surf->db_htile_surface |=
-                                       S_028ABC_RB_ALIGNED(tex->surface.u.gfx9.htile.rb_aligned);
-                       }
-               }
-       } else {
-               /* GFX6-GFX8 */
-               struct legacy_surf_level *levelinfo = &tex->surface.u.legacy.level[level];
-
-               assert(levelinfo->nblk_x % 8 == 0 && levelinfo->nblk_y % 8 == 0);
-
-               surf->db_depth_base = (tex->buffer.gpu_address +
-                                      tex->surface.u.legacy.level[level].offset) >> 8;
-               surf->db_stencil_base = (tex->buffer.gpu_address +
-                                        tex->surface.u.legacy.stencil_level[level].offset) >> 8;
-
-               z_info = S_028040_FORMAT(format) |
-                        S_028040_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples));
-               s_info = S_028044_FORMAT(stencil_format);
-               surf->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(!tex->tc_compatible_htile);
-
-               if (sctx->chip_class >= GFX7) {
-                       struct radeon_info *info = &sctx->screen->info;
-                       unsigned index = tex->surface.u.legacy.tiling_index[level];
-                       unsigned stencil_index = tex->surface.u.legacy.stencil_tiling_index[level];
-                       unsigned macro_index = tex->surface.u.legacy.macro_tile_index;
-                       unsigned tile_mode = info->si_tile_mode_array[index];
-                       unsigned stencil_tile_mode = info->si_tile_mode_array[stencil_index];
-                       unsigned macro_mode = info->cik_macrotile_mode_array[macro_index];
-
-                       surf->db_depth_info |=
-                               S_02803C_ARRAY_MODE(G_009910_ARRAY_MODE(tile_mode)) |
-                               S_02803C_PIPE_CONFIG(G_009910_PIPE_CONFIG(tile_mode)) |
-                               S_02803C_BANK_WIDTH(G_009990_BANK_WIDTH(macro_mode)) |
-                               S_02803C_BANK_HEIGHT(G_009990_BANK_HEIGHT(macro_mode)) |
-                               S_02803C_MACRO_TILE_ASPECT(G_009990_MACRO_TILE_ASPECT(macro_mode)) |
-                               S_02803C_NUM_BANKS(G_009990_NUM_BANKS(macro_mode));
-                       z_info |= S_028040_TILE_SPLIT(G_009910_TILE_SPLIT(tile_mode));
-                       s_info |= S_028044_TILE_SPLIT(G_009910_TILE_SPLIT(stencil_tile_mode));
-               } else {
-                       unsigned tile_mode_index = si_tile_mode_index(tex, level, false);
-                       z_info |= S_028040_TILE_MODE_INDEX(tile_mode_index);
-                       tile_mode_index = si_tile_mode_index(tex, level, true);
-                       s_info |= S_028044_TILE_MODE_INDEX(tile_mode_index);
-               }
-
-               surf->db_depth_size = S_028058_PITCH_TILE_MAX((levelinfo->nblk_x / 8) - 1) |
-                                     S_028058_HEIGHT_TILE_MAX((levelinfo->nblk_y / 8) - 1);
-               surf->db_depth_slice = S_02805C_SLICE_TILE_MAX((levelinfo->nblk_x *
-                                                               levelinfo->nblk_y) / 64 - 1);
-
-               if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) {
-                       z_info |= S_028040_TILE_SURFACE_ENABLE(1) |
-                                 S_028040_ALLOW_EXPCLEAR(1);
-
-                       if (tex->surface.has_stencil) {
-                               /* Workaround: For a not yet understood reason, the
-                                * combination of MSAA, fast stencil clear and stencil
-                                * decompress messes with subsequent stencil buffer
-                                * uses. Problem was reproduced on Verde, Bonaire,
-                                * Tonga, and Carrizo.
-                                *
-                                * Disabling EXPCLEAR works around the problem.
-                                *
-                                * Check piglit's arb_texture_multisample-stencil-clear
-                                * test if you want to try changing this.
-                                */
-                               if (tex->buffer.b.b.nr_samples <= 1)
-                                       s_info |= S_028044_ALLOW_EXPCLEAR(1);
-                       } else if (!tex->tc_compatible_htile) {
-                               /* Use all of the htile_buffer for depth if there's no stencil.
-                                * This must not be set when TC-compatible HTILE is enabled
-                                * due to a hw bug.
-                                */
-                               s_info |= S_028044_TILE_STENCIL_DISABLE(1);
-                       }
-
-                       surf->db_htile_data_base = (tex->buffer.gpu_address +
-                                                   tex->surface.htile_offset) >> 8;
-                       surf->db_htile_surface = S_028ABC_FULL_CACHE(1);
-
-                       if (tex->tc_compatible_htile) {
-                               surf->db_htile_surface |= S_028ABC_TC_COMPATIBLE(1);
-
-                               /* 0 = full compression. N = only compress up to N-1 Z planes. */
-                               if (tex->buffer.b.b.nr_samples <= 1)
-                                       z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(5);
-                               else if (tex->buffer.b.b.nr_samples <= 4)
-                                       z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(3);
-                               else
-                                       z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(2);
-                       }
-               }
-       }
-
-       surf->db_z_info = z_info;
-       surf->db_stencil_info = s_info;
-
-       surf->depth_initialized = true;
+   struct si_texture *tex = (struct si_texture *)surf->base.texture;
+   unsigned level = surf->base.u.tex.level;
+   unsigned format, stencil_format;
+   uint32_t z_info, s_info;
+
+   format = si_translate_dbformat(tex->db_render_format);
+   stencil_format = tex->surface.has_stencil ? V_028044_STENCIL_8 : V_028044_STENCIL_INVALID;
+
+   assert(format != V_028040_Z_INVALID);
+   if (format == V_028040_Z_INVALID)
+      PRINT_ERR("Invalid DB format: %d, disabling DB.\n", tex->buffer.b.b.format);
+
+   surf->db_depth_view = S_028008_SLICE_START(surf->base.u.tex.first_layer) |
+                         S_028008_SLICE_MAX(surf->base.u.tex.last_layer);
+   surf->db_htile_data_base = 0;
+   surf->db_htile_surface = 0;
+
+   if (sctx->chip_class >= GFX10) {
+      surf->db_depth_view |= S_028008_SLICE_START_HI(surf->base.u.tex.first_layer >> 11) |
+                             S_028008_SLICE_MAX_HI(surf->base.u.tex.last_layer >> 11);
+   }
+
+   if (sctx->chip_class >= GFX9) {
+      assert(tex->surface.u.gfx9.surf_offset == 0);
+      surf->db_depth_base = tex->buffer.gpu_address >> 8;
+      surf->db_stencil_base = (tex->buffer.gpu_address + tex->surface.u.gfx9.stencil_offset) >> 8;
+      z_info = S_028038_FORMAT(format) |
+               S_028038_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)) |
+               S_028038_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) |
+               S_028038_MAXMIP(tex->buffer.b.b.last_level);
+      s_info = S_02803C_FORMAT(stencil_format) |
+               S_02803C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode);
+
+      if (sctx->chip_class == GFX9) {
+         surf->db_z_info2 = S_028068_EPITCH(tex->surface.u.gfx9.surf.epitch);
+         surf->db_stencil_info2 = S_02806C_EPITCH(tex->surface.u.gfx9.stencil.epitch);
+      }
+      surf->db_depth_view |= S_028008_MIPID(level);
+      surf->db_depth_size =
+         S_02801C_X_MAX(tex->buffer.b.b.width0 - 1) | S_02801C_Y_MAX(tex->buffer.b.b.height0 - 1);
+
+      if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) {
+         z_info |= S_028038_TILE_SURFACE_ENABLE(1) | S_028038_ALLOW_EXPCLEAR(1);
+
+         if (tex->tc_compatible_htile) {
+            unsigned max_zplanes = 4;
+
+            if (tex->db_render_format == PIPE_FORMAT_Z16_UNORM && tex->buffer.b.b.nr_samples > 1)
+               max_zplanes = 2;
+
+            z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes + 1);
+
+            if (sctx->chip_class >= GFX10) {
+               z_info |= S_028040_ITERATE_FLUSH(1);
+               s_info |= S_028044_ITERATE_FLUSH(!tex->htile_stencil_disabled);
+            } else {
+               z_info |= S_028038_ITERATE_FLUSH(1);
+               s_info |= S_02803C_ITERATE_FLUSH(1);
+            }
+         }
+
+         if (tex->surface.has_stencil && !tex->htile_stencil_disabled) {
+            /* Stencil buffer workaround ported from the GFX6-GFX8 code.
+             * See that for explanation.
+             */
+            s_info |= S_02803C_ALLOW_EXPCLEAR(tex->buffer.b.b.nr_samples <= 1);
+         } else {
+            /* Use all HTILE for depth if there's no stencil. */
+            s_info |= S_02803C_TILE_STENCIL_DISABLE(1);
+         }
+
+         surf->db_htile_data_base = (tex->buffer.gpu_address + tex->surface.htile_offset) >> 8;
+         surf->db_htile_surface =
+            S_028ABC_FULL_CACHE(1) | S_028ABC_PIPE_ALIGNED(tex->surface.u.gfx9.htile.pipe_aligned);
+         if (sctx->chip_class == GFX9) {
+            surf->db_htile_surface |= S_028ABC_RB_ALIGNED(tex->surface.u.gfx9.htile.rb_aligned);
+         }
+      }
+   } else {
+      /* GFX6-GFX8 */
+      struct legacy_surf_level *levelinfo = &tex->surface.u.legacy.level[level];
+
+      assert(levelinfo->nblk_x % 8 == 0 && levelinfo->nblk_y % 8 == 0);
+
+      surf->db_depth_base =
+         (tex->buffer.gpu_address + tex->surface.u.legacy.level[level].offset) >> 8;
+      surf->db_stencil_base =
+         (tex->buffer.gpu_address + tex->surface.u.legacy.stencil_level[level].offset) >> 8;
+
+      z_info =
+         S_028040_FORMAT(format) | S_028040_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples));
+      s_info = S_028044_FORMAT(stencil_format);
+      surf->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(!tex->tc_compatible_htile);
+
+      if (sctx->chip_class >= GFX7) {
+         struct radeon_info *info = &sctx->screen->info;
+         unsigned index = tex->surface.u.legacy.tiling_index[level];
+         unsigned stencil_index = tex->surface.u.legacy.stencil_tiling_index[level];
+         unsigned macro_index = tex->surface.u.legacy.macro_tile_index;
+         unsigned tile_mode = info->si_tile_mode_array[index];
+         unsigned stencil_tile_mode = info->si_tile_mode_array[stencil_index];
+         unsigned macro_mode = info->cik_macrotile_mode_array[macro_index];
+
+         surf->db_depth_info |= S_02803C_ARRAY_MODE(G_009910_ARRAY_MODE(tile_mode)) |
+                                S_02803C_PIPE_CONFIG(G_009910_PIPE_CONFIG(tile_mode)) |
+                                S_02803C_BANK_WIDTH(G_009990_BANK_WIDTH(macro_mode)) |
+                                S_02803C_BANK_HEIGHT(G_009990_BANK_HEIGHT(macro_mode)) |
+                                S_02803C_MACRO_TILE_ASPECT(G_009990_MACRO_TILE_ASPECT(macro_mode)) |
+                                S_02803C_NUM_BANKS(G_009990_NUM_BANKS(macro_mode));
+         z_info |= S_028040_TILE_SPLIT(G_009910_TILE_SPLIT(tile_mode));
+         s_info |= S_028044_TILE_SPLIT(G_009910_TILE_SPLIT(stencil_tile_mode));
+      } else {
+         unsigned tile_mode_index = si_tile_mode_index(tex, level, false);
+         z_info |= S_028040_TILE_MODE_INDEX(tile_mode_index);
+         tile_mode_index = si_tile_mode_index(tex, level, true);
+         s_info |= S_028044_TILE_MODE_INDEX(tile_mode_index);
+      }
+
+      surf->db_depth_size = S_028058_PITCH_TILE_MAX((levelinfo->nblk_x / 8) - 1) |
+                            S_028058_HEIGHT_TILE_MAX((levelinfo->nblk_y / 8) - 1);
+      surf->db_depth_slice =
+         S_02805C_SLICE_TILE_MAX((levelinfo->nblk_x * levelinfo->nblk_y) / 64 - 1);
+
+      if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) {
+         z_info |= S_028040_TILE_SURFACE_ENABLE(1) | S_028040_ALLOW_EXPCLEAR(1);
+
+         if (tex->surface.has_stencil) {
+            /* Workaround: For a not yet understood reason, the
+             * combination of MSAA, fast stencil clear and stencil
+             * decompress messes with subsequent stencil buffer
+             * uses. Problem was reproduced on Verde, Bonaire,
+             * Tonga, and Carrizo.
+             *
+             * Disabling EXPCLEAR works around the problem.
+             *
+             * Check piglit's arb_texture_multisample-stencil-clear
+             * test if you want to try changing this.
+             */
+            if (tex->buffer.b.b.nr_samples <= 1)
+               s_info |= S_028044_ALLOW_EXPCLEAR(1);
+         } else if (!tex->tc_compatible_htile) {
+            /* Use all of the htile_buffer for depth if there's no stencil.
+             * This must not be set when TC-compatible HTILE is enabled
+             * due to a hw bug.
+             */
+            s_info |= S_028044_TILE_STENCIL_DISABLE(1);
+         }
+
+         surf->db_htile_data_base = (tex->buffer.gpu_address + tex->surface.htile_offset) >> 8;
+         surf->db_htile_surface = S_028ABC_FULL_CACHE(1);
+
+         if (tex->tc_compatible_htile) {
+            surf->db_htile_surface |= S_028ABC_TC_COMPATIBLE(1);
+
+            /* 0 = full compression. N = only compress up to N-1 Z planes. */
+            if (tex->buffer.b.b.nr_samples <= 1)
+               z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(5);
+            else if (tex->buffer.b.b.nr_samples <= 4)
+               z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(3);
+            else
+               z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(2);
+         }
+      }
+   }
+
+   surf->db_z_info = z_info;
+   surf->db_stencil_info = s_info;
+
+   surf->depth_initialized = true;
 }
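
A note on the TC-compatible HTILE setup above: the DECOMPRESS_ON_N_ZPLANES value follows a small rule that differs between the GFX9+ and GFX6-GFX8 paths. The standalone helper below restates it for illustration only; it is not part of this commit and its names are invented.

#include <stdbool.h>

/* 0 = full compression, N = only compress up to N-1 Z planes
 * (see the comment in the GFX6-GFX8 path above). */
static unsigned decompress_on_n_zplanes(bool gfx9_plus, bool is_z16, unsigned nr_samples)
{
   if (gfx9_plus) {
      /* GFX9+: max_zplanes is 4, or 2 for multisampled Z16, and the
       * register field is programmed with max_zplanes + 1. */
      unsigned max_zplanes = (is_z16 && nr_samples > 1) ? 2 : 4;
      return max_zplanes + 1;
   }
   if (nr_samples <= 1)
      return 5;
   if (nr_samples <= 4)
      return 3;
   return 2;
}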
 
 void si_update_fb_dirtiness_after_rendering(struct si_context *sctx)
 {
-       if (sctx->decompression_enabled)
-               return;
-
-       if (sctx->framebuffer.state.zsbuf) {
-               struct pipe_surface *surf = sctx->framebuffer.state.zsbuf;
-               struct si_texture *tex = (struct si_texture *)surf->texture;
-
-               tex->dirty_level_mask |= 1 << surf->u.tex.level;
-
-               if (tex->surface.has_stencil)
-                       tex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;
-       }
-
-       unsigned compressed_cb_mask = sctx->framebuffer.compressed_cb_mask;
-       while (compressed_cb_mask) {
-               unsigned i = u_bit_scan(&compressed_cb_mask);
-               struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i];
-               struct si_texture *tex = (struct si_texture*)surf->texture;
-
-               if (tex->surface.fmask_offset) {
-                       tex->dirty_level_mask |= 1 << surf->u.tex.level;
-                       tex->fmask_is_identity = false;
-               }
-               if (tex->dcc_gather_statistics)
-                       tex->separate_dcc_dirty = true;
-       }
+   if (sctx->decompression_enabled)
+      return;
+
+   if (sctx->framebuffer.state.zsbuf) {
+      struct pipe_surface *surf = sctx->framebuffer.state.zsbuf;
+      struct si_texture *tex = (struct si_texture *)surf->texture;
+
+      tex->dirty_level_mask |= 1 << surf->u.tex.level;
+
+      if (tex->surface.has_stencil)
+         tex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;
+   }
+
+   unsigned compressed_cb_mask = sctx->framebuffer.compressed_cb_mask;
+   while (compressed_cb_mask) {
+      unsigned i = u_bit_scan(&compressed_cb_mask);
+      struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i];
+      struct si_texture *tex = (struct si_texture *)surf->texture;
+
+      if (tex->surface.fmask_offset) {
+         tex->dirty_level_mask |= 1 << surf->u.tex.level;
+         tex->fmask_is_identity = false;
+      }
+      if (tex->dcc_gather_statistics)
+         tex->separate_dcc_dirty = true;
+   }
 }
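
The compressed_cb_mask loop above relies on u_bit_scan(), which returns the index of the lowest set bit and clears it from the mask. A minimal plain-C equivalent, shown for illustration only (not part of this commit; the name is invented):

#include <strings.h> /* ffs() */

static int bit_scan_lowest(unsigned *mask)
{
   int i = ffs(*mask) - 1; /* index of the lowest set bit */
   *mask &= *mask - 1;     /* clear that bit */
   return i;
}

/* Usage mirrors the loop above:
 *    while (mask) {
 *       unsigned i = bit_scan_lowest(&mask);
 *       ... process color buffer i ...
 *    }
 */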
 
 static void si_dec_framebuffer_counters(const struct pipe_framebuffer_state *state)
 {
-       for (int i = 0; i < state->nr_cbufs; ++i) {
-               struct si_surface *surf = NULL;
-               struct si_texture *tex;
+   for (int i = 0; i < state->nr_cbufs; ++i) {
+      struct si_surface *surf = NULL;
+      struct si_texture *tex;
 
-               if (!state->cbufs[i])
-                       continue;
-               surf = (struct si_surface*)state->cbufs[i];
-               tex = (struct si_texture*)surf->base.texture;
+      if (!state->cbufs[i])
+         continue;
+      surf = (struct si_surface *)state->cbufs[i];
+      tex = (struct si_texture *)surf->base.texture;
 
-               p_atomic_dec(&tex->framebuffers_bound);
-       }
+      p_atomic_dec(&tex->framebuffers_bound);
+   }
 }
 
 static void si_set_framebuffer_state(struct pipe_context *ctx,
-                                    const struct pipe_framebuffer_state *state)
+                                     const struct pipe_framebuffer_state *state)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_surface *surf = NULL;
-       struct si_texture *tex;
-       bool old_any_dst_linear = sctx->framebuffer.any_dst_linear;
-       unsigned old_nr_samples = sctx->framebuffer.nr_samples;
-       unsigned old_colorbuf_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit;
-       bool old_has_zsbuf = !!sctx->framebuffer.state.zsbuf;
-       bool old_has_stencil =
-               old_has_zsbuf &&
-               ((struct si_texture*)sctx->framebuffer.state.zsbuf->texture)->surface.has_stencil;
-       bool unbound = false;
-       int i;
-
-       /* Reject zero-sized framebuffers due to a hw bug on GFX6 that occurs
-        * when PA_SU_HARDWARE_SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0.
-        * We could implement the full workaround here, but it's a useless case.
-        */
-       if ((!state->width || !state->height) && (state->nr_cbufs || state->zsbuf)) {
-               unreachable("the framebuffer shouldn't have zero area");
-               return;
-       }
-
-       si_update_fb_dirtiness_after_rendering(sctx);
-
-       for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
-               if (!sctx->framebuffer.state.cbufs[i])
-                       continue;
-
-               tex = (struct si_texture*)sctx->framebuffer.state.cbufs[i]->texture;
-               if (tex->dcc_gather_statistics)
-                       vi_separate_dcc_stop_query(sctx, tex);
-       }
-
-       /* Disable DCC if the formats are incompatible. */
-       for (i = 0; i < state->nr_cbufs; i++) {
-               if (!state->cbufs[i])
-                       continue;
-
-               surf = (struct si_surface*)state->cbufs[i];
-               tex = (struct si_texture*)surf->base.texture;
-
-               if (!surf->dcc_incompatible)
-                       continue;
-
-               /* Since the DCC decompression calls back into set_framebuffer-
-                * _state, we need to unbind the framebuffer, so that
-                * vi_separate_dcc_stop_query isn't called twice with the same
-                * color buffer.
-                */
-               if (!unbound) {
-                       util_copy_framebuffer_state(&sctx->framebuffer.state, NULL);
-                       unbound = true;
-               }
-
-               if (vi_dcc_enabled(tex, surf->base.u.tex.level))
-                       if (!si_texture_disable_dcc(sctx, tex))
-                               si_decompress_dcc(sctx, tex);
-
-               surf->dcc_incompatible = false;
-       }
-
-       /* Only flush TC when changing the framebuffer state, because
-        * the only client not using TC that can change textures is
-        * the framebuffer.
-        *
-        * Wait for compute shaders because of possible transitions:
-        * - FB write -> shader read
-        * - shader write -> FB read
-        *
-        * DB caches are flushed on demand (using si_decompress_textures).
-        *
-        * When MSAA is enabled, CB and TC caches are flushed on demand
-        * (after FMASK decompression). Shader write -> FB read transitions
-        * cannot happen for MSAA textures, because MSAA shader images are
-        * not supported.
-        *
-        * Only flush and wait for CB if there is actually a bound color buffer.
-        */
-       if (sctx->framebuffer.uncompressed_cb_mask) {
-               si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
-                                          sctx->framebuffer.CB_has_shader_readable_metadata,
-                                          sctx->framebuffer.all_DCC_pipe_aligned);
-       }
-
-       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
-
-       /* u_blitter doesn't invoke depth decompression when it does multiple
-        * blits in a row, but the only case when it matters for DB is when
-        * doing generate_mipmap. So here we flush DB manually between
-        * individual generate_mipmap blits.
-        * Note that lower mipmap levels aren't compressed.
-        */
-       if (sctx->generate_mipmap_for_depth) {
-               si_make_DB_shader_coherent(sctx, 1, false,
-                                          sctx->framebuffer.DB_has_shader_readable_metadata);
-       } else if (sctx->chip_class == GFX9) {
-               /* It appears that DB metadata "leaks" in a sequence of:
-                *  - depth clear
-                *  - DCC decompress for shader image writes (with DB disabled)
-                *  - render with DEPTH_BEFORE_SHADER=1
-                * Flushing DB metadata works around the problem.
-                */
-               sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB_META;
-       }
-
-       /* Take the maximum of the old and new count. If the new count is lower,
-        * dirtying is needed to disable the unbound colorbuffers.
-        */
-       sctx->framebuffer.dirty_cbufs |=
-               (1 << MAX2(sctx->framebuffer.state.nr_cbufs, state->nr_cbufs)) - 1;
-       sctx->framebuffer.dirty_zsbuf |= sctx->framebuffer.state.zsbuf != state->zsbuf;
-
-       si_dec_framebuffer_counters(&sctx->framebuffer.state);
-       util_copy_framebuffer_state(&sctx->framebuffer.state, state);
-
-       sctx->framebuffer.colorbuf_enabled_4bit = 0;
-       sctx->framebuffer.spi_shader_col_format = 0;
-       sctx->framebuffer.spi_shader_col_format_alpha = 0;
-       sctx->framebuffer.spi_shader_col_format_blend = 0;
-       sctx->framebuffer.spi_shader_col_format_blend_alpha = 0;
-       sctx->framebuffer.color_is_int8 = 0;
-       sctx->framebuffer.color_is_int10 = 0;
-
-       sctx->framebuffer.compressed_cb_mask = 0;
-       sctx->framebuffer.uncompressed_cb_mask = 0;
-       sctx->framebuffer.displayable_dcc_cb_mask = 0;
-       sctx->framebuffer.nr_samples = util_framebuffer_get_num_samples(state);
-       sctx->framebuffer.nr_color_samples = sctx->framebuffer.nr_samples;
-       sctx->framebuffer.log_samples = util_logbase2(sctx->framebuffer.nr_samples);
-       sctx->framebuffer.any_dst_linear = false;
-       sctx->framebuffer.CB_has_shader_readable_metadata = false;
-       sctx->framebuffer.DB_has_shader_readable_metadata = false;
-       sctx->framebuffer.all_DCC_pipe_aligned = true;
-       sctx->framebuffer.min_bytes_per_pixel = 0;
-
-       for (i = 0; i < state->nr_cbufs; i++) {
-               if (!state->cbufs[i])
-                       continue;
-
-               surf = (struct si_surface*)state->cbufs[i];
-               tex = (struct si_texture*)surf->base.texture;
-
-               if (!surf->color_initialized) {
-                       si_initialize_color_surface(sctx, surf);
-               }
-
-               sctx->framebuffer.colorbuf_enabled_4bit |= 0xf << (i * 4);
-               sctx->framebuffer.spi_shader_col_format |=
-                       surf->spi_shader_col_format << (i * 4);
-               sctx->framebuffer.spi_shader_col_format_alpha |=
-                       surf->spi_shader_col_format_alpha << (i * 4);
-               sctx->framebuffer.spi_shader_col_format_blend |=
-                       surf->spi_shader_col_format_blend << (i * 4);
-               sctx->framebuffer.spi_shader_col_format_blend_alpha |=
-                       surf->spi_shader_col_format_blend_alpha << (i * 4);
-
-               if (surf->color_is_int8)
-                       sctx->framebuffer.color_is_int8 |= 1 << i;
-               if (surf->color_is_int10)
-                       sctx->framebuffer.color_is_int10 |= 1 << i;
-
-               if (tex->surface.fmask_offset)
-                       sctx->framebuffer.compressed_cb_mask |= 1 << i;
-               else
-                       sctx->framebuffer.uncompressed_cb_mask |= 1 << i;
-
-               if (tex->surface.dcc_offset)
-                       sctx->framebuffer.displayable_dcc_cb_mask |= 1 << i;
-
-               /* Don't update nr_color_samples for non-AA buffers.
-                * (e.g. destination of MSAA resolve)
-                */
-               if (tex->buffer.b.b.nr_samples >= 2 &&
-                   tex->buffer.b.b.nr_storage_samples < tex->buffer.b.b.nr_samples) {
-                       sctx->framebuffer.nr_color_samples =
-                               MIN2(sctx->framebuffer.nr_color_samples,
-                                    tex->buffer.b.b.nr_storage_samples);
-                       sctx->framebuffer.nr_color_samples =
-                               MAX2(1, sctx->framebuffer.nr_color_samples);
-               }
-
-               if (tex->surface.is_linear)
-                       sctx->framebuffer.any_dst_linear = true;
-
-               if (vi_dcc_enabled(tex, surf->base.u.tex.level)) {
-                       sctx->framebuffer.CB_has_shader_readable_metadata = true;
-
-                       if (sctx->chip_class >= GFX9 &&
-                           !tex->surface.u.gfx9.dcc.pipe_aligned)
-                               sctx->framebuffer.all_DCC_pipe_aligned = false;
-               }
-
-               si_context_add_resource_size(sctx, surf->base.texture);
-
-               p_atomic_inc(&tex->framebuffers_bound);
-
-               if (tex->dcc_gather_statistics) {
-                       /* Dirty tracking must be enabled for DCC usage analysis. */
-                       sctx->framebuffer.compressed_cb_mask |= 1 << i;
-                       vi_separate_dcc_start_query(sctx, tex);
-               }
-
-               /* Update the minimum but don't keep 0. */
-               if (!sctx->framebuffer.min_bytes_per_pixel ||
-                   tex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel)
-                       sctx->framebuffer.min_bytes_per_pixel = tex->surface.bpe;
-       }
-
-       /* For optimal DCC performance. */
-       if (sctx->chip_class >= GFX10)
-               sctx->framebuffer.dcc_overwrite_combiner_watermark = 6;
-       else
-               sctx->framebuffer.dcc_overwrite_combiner_watermark = 4;
-
-       struct si_texture *zstex = NULL;
-
-       if (state->zsbuf) {
-               surf = (struct si_surface*)state->zsbuf;
-               zstex = (struct si_texture*)surf->base.texture;
-
-               if (!surf->depth_initialized) {
-                       si_init_depth_surface(sctx, surf);
-               }
-
-               if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level,
-                                              PIPE_MASK_ZS))
-                       sctx->framebuffer.DB_has_shader_readable_metadata = true;
-
-               si_context_add_resource_size(sctx, surf->base.texture);
-
-               /* Update the minimum but don't keep 0. */
-               if (!sctx->framebuffer.min_bytes_per_pixel ||
-                   zstex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel)
-                       sctx->framebuffer.min_bytes_per_pixel = zstex->surface.bpe;
-       }
-
-       si_update_ps_colorbuf0_slot(sctx);
-       si_update_poly_offset_state(sctx);
-       si_update_ngg_small_prim_precision(sctx);
-       si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
-       si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
-
-       if (sctx->screen->dpbb_allowed)
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
-
-       if (sctx->framebuffer.any_dst_linear != old_any_dst_linear)
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
-
-       if (sctx->screen->has_out_of_order_rast &&
-           (sctx->framebuffer.colorbuf_enabled_4bit != old_colorbuf_enabled_4bit ||
-            !!sctx->framebuffer.state.zsbuf != old_has_zsbuf ||
-            (zstex && zstex->surface.has_stencil != old_has_stencil)))
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
-
-       if (sctx->framebuffer.nr_samples != old_nr_samples) {
-               struct pipe_constant_buffer constbuf = {0};
-
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
-
-               constbuf.buffer = sctx->sample_pos_buffer;
-
-               /* Set sample locations as fragment shader constants. */
-               switch (sctx->framebuffer.nr_samples) {
-               case 1:
-                       constbuf.buffer_offset = 0;
-                       break;
-               case 2:
-                       constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x2 -
-                                                (ubyte*)sctx->sample_positions.x1;
-                       break;
-               case 4:
-                       constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x4 -
-                                                (ubyte*)sctx->sample_positions.x1;
-                       break;
-               case 8:
-                       constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x8 -
-                                                (ubyte*)sctx->sample_positions.x1;
-                       break;
-               case 16:
-                       constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x16 -
-                                                (ubyte*)sctx->sample_positions.x1;
-                       break;
-               default:
-                       PRINT_ERR("Requested an invalid number of samples %i.\n",
-                                sctx->framebuffer.nr_samples);
-                       assert(0);
-               }
-               constbuf.buffer_size = sctx->framebuffer.nr_samples * 2 * 4;
-               si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &constbuf);
-
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
-       }
-
-       sctx->do_update_shaders = true;
-
-       if (!sctx->decompression_enabled) {
-               /* Prevent texture decompression when the framebuffer state
-                * changes come from the decompression passes themselves.
-                */
-               sctx->need_check_render_feedback = true;
-       }
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_surface *surf = NULL;
+   struct si_texture *tex;
+   bool old_any_dst_linear = sctx->framebuffer.any_dst_linear;
+   unsigned old_nr_samples = sctx->framebuffer.nr_samples;
+   unsigned old_colorbuf_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit;
+   bool old_has_zsbuf = !!sctx->framebuffer.state.zsbuf;
+   bool old_has_stencil =
+      old_has_zsbuf &&
+      ((struct si_texture *)sctx->framebuffer.state.zsbuf->texture)->surface.has_stencil;
+   bool unbound = false;
+   int i;
+
+   /* Reject zero-sized framebuffers due to a hw bug on GFX6 that occurs
+    * when PA_SU_HARDWARE_SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0.
+    * We could implement the full workaround here, but it's a useless case.
+    */
+   if ((!state->width || !state->height) && (state->nr_cbufs || state->zsbuf)) {
+      unreachable("the framebuffer shouldn't have zero area");
+      return;
+   }
+
+   si_update_fb_dirtiness_after_rendering(sctx);
+
+   for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
+      if (!sctx->framebuffer.state.cbufs[i])
+         continue;
+
+      tex = (struct si_texture *)sctx->framebuffer.state.cbufs[i]->texture;
+      if (tex->dcc_gather_statistics)
+         vi_separate_dcc_stop_query(sctx, tex);
+   }
+
+   /* Disable DCC if the formats are incompatible. */
+   for (i = 0; i < state->nr_cbufs; i++) {
+      if (!state->cbufs[i])
+         continue;
+
+      surf = (struct si_surface *)state->cbufs[i];
+      tex = (struct si_texture *)surf->base.texture;
+
+      if (!surf->dcc_incompatible)
+         continue;
+
+      /* Since the DCC decompression calls back into set_framebuffer_state, we need to unbind
+       * the framebuffer, so that vi_separate_dcc_stop_query isn't called twice with the same
+       * color buffer.
+       */
+      if (!unbound) {
+         util_copy_framebuffer_state(&sctx->framebuffer.state, NULL);
+         unbound = true;
+      }
+
+      if (vi_dcc_enabled(tex, surf->base.u.tex.level))
+         if (!si_texture_disable_dcc(sctx, tex))
+            si_decompress_dcc(sctx, tex);
+
+      surf->dcc_incompatible = false;
+   }
+
+   /* Only flush TC when changing the framebuffer state, because
+    * the only client not using TC that can change textures is
+    * the framebuffer.
+    *
+    * Wait for compute shaders because of possible transitions:
+    * - FB write -> shader read
+    * - shader write -> FB read
+    *
+    * DB caches are flushed on demand (using si_decompress_textures).
+    *
+    * When MSAA is enabled, CB and TC caches are flushed on demand
+    * (after FMASK decompression). Shader write -> FB read transitions
+    * cannot happen for MSAA textures, because MSAA shader images are
+    * not supported.
+    *
+    * Only flush and wait for CB if there is actually a bound color buffer.
+    */
+   if (sctx->framebuffer.uncompressed_cb_mask) {
+      si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
+                                 sctx->framebuffer.CB_has_shader_readable_metadata,
+                                 sctx->framebuffer.all_DCC_pipe_aligned);
+   }
+
+   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+
+   /* u_blitter doesn't invoke depth decompression when it does multiple
+    * blits in a row, but the only case when it matters for DB is when
+    * doing generate_mipmap. So here we flush DB manually between
+    * individual generate_mipmap blits.
+    * Note that lower mipmap levels aren't compressed.
+    */
+   if (sctx->generate_mipmap_for_depth) {
+      si_make_DB_shader_coherent(sctx, 1, false, sctx->framebuffer.DB_has_shader_readable_metadata);
+   } else if (sctx->chip_class == GFX9) {
+      /* It appears that DB metadata "leaks" in a sequence of:
+       *  - depth clear
+       *  - DCC decompress for shader image writes (with DB disabled)
+       *  - render with DEPTH_BEFORE_SHADER=1
+       * Flushing DB metadata works around the problem.
+       */
+      sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB_META;
+   }
+
+   /* Take the maximum of the old and new count. If the new count is lower,
+    * dirtying is needed to disable the unbound colorbuffers.
+    */
+   sctx->framebuffer.dirty_cbufs |=
+      (1 << MAX2(sctx->framebuffer.state.nr_cbufs, state->nr_cbufs)) - 1;
+   sctx->framebuffer.dirty_zsbuf |= sctx->framebuffer.state.zsbuf != state->zsbuf;
+
+   si_dec_framebuffer_counters(&sctx->framebuffer.state);
+   util_copy_framebuffer_state(&sctx->framebuffer.state, state);
+
+   sctx->framebuffer.colorbuf_enabled_4bit = 0;
+   sctx->framebuffer.spi_shader_col_format = 0;
+   sctx->framebuffer.spi_shader_col_format_alpha = 0;
+   sctx->framebuffer.spi_shader_col_format_blend = 0;
+   sctx->framebuffer.spi_shader_col_format_blend_alpha = 0;
+   sctx->framebuffer.color_is_int8 = 0;
+   sctx->framebuffer.color_is_int10 = 0;
+
+   sctx->framebuffer.compressed_cb_mask = 0;
+   sctx->framebuffer.uncompressed_cb_mask = 0;
+   sctx->framebuffer.displayable_dcc_cb_mask = 0;
+   sctx->framebuffer.nr_samples = util_framebuffer_get_num_samples(state);
+   sctx->framebuffer.nr_color_samples = sctx->framebuffer.nr_samples;
+   sctx->framebuffer.log_samples = util_logbase2(sctx->framebuffer.nr_samples);
+   sctx->framebuffer.any_dst_linear = false;
+   sctx->framebuffer.CB_has_shader_readable_metadata = false;
+   sctx->framebuffer.DB_has_shader_readable_metadata = false;
+   sctx->framebuffer.all_DCC_pipe_aligned = true;
+   sctx->framebuffer.min_bytes_per_pixel = 0;
+
+   for (i = 0; i < state->nr_cbufs; i++) {
+      if (!state->cbufs[i])
+         continue;
+
+      surf = (struct si_surface *)state->cbufs[i];
+      tex = (struct si_texture *)surf->base.texture;
+
+      if (!surf->color_initialized) {
+         si_initialize_color_surface(sctx, surf);
+      }
+
+      sctx->framebuffer.colorbuf_enabled_4bit |= 0xf << (i * 4);
+      sctx->framebuffer.spi_shader_col_format |= surf->spi_shader_col_format << (i * 4);
+      sctx->framebuffer.spi_shader_col_format_alpha |= surf->spi_shader_col_format_alpha << (i * 4);
+      sctx->framebuffer.spi_shader_col_format_blend |= surf->spi_shader_col_format_blend << (i * 4);
+      sctx->framebuffer.spi_shader_col_format_blend_alpha |= surf->spi_shader_col_format_blend_alpha
+                                                             << (i * 4);
+
+      if (surf->color_is_int8)
+         sctx->framebuffer.color_is_int8 |= 1 << i;
+      if (surf->color_is_int10)
+         sctx->framebuffer.color_is_int10 |= 1 << i;
+
+      if (tex->surface.fmask_offset)
+         sctx->framebuffer.compressed_cb_mask |= 1 << i;
+      else
+         sctx->framebuffer.uncompressed_cb_mask |= 1 << i;
+
+      if (tex->surface.dcc_offset)
+         sctx->framebuffer.displayable_dcc_cb_mask |= 1 << i;
+
+      /* Don't update nr_color_samples for non-AA buffers.
+       * (e.g. destination of MSAA resolve)
+       */
+      if (tex->buffer.b.b.nr_samples >= 2 &&
+          tex->buffer.b.b.nr_storage_samples < tex->buffer.b.b.nr_samples) {
+         sctx->framebuffer.nr_color_samples =
+            MIN2(sctx->framebuffer.nr_color_samples, tex->buffer.b.b.nr_storage_samples);
+         sctx->framebuffer.nr_color_samples = MAX2(1, sctx->framebuffer.nr_color_samples);
+      }
+
+      if (tex->surface.is_linear)
+         sctx->framebuffer.any_dst_linear = true;
+
+      if (vi_dcc_enabled(tex, surf->base.u.tex.level)) {
+         sctx->framebuffer.CB_has_shader_readable_metadata = true;
+
+         if (sctx->chip_class >= GFX9 && !tex->surface.u.gfx9.dcc.pipe_aligned)
+            sctx->framebuffer.all_DCC_pipe_aligned = false;
+      }
+
+      si_context_add_resource_size(sctx, surf->base.texture);
+
+      p_atomic_inc(&tex->framebuffers_bound);
+
+      if (tex->dcc_gather_statistics) {
+         /* Dirty tracking must be enabled for DCC usage analysis. */
+         sctx->framebuffer.compressed_cb_mask |= 1 << i;
+         vi_separate_dcc_start_query(sctx, tex);
+      }
+
+      /* Update the minimum but don't keep 0. */
+      if (!sctx->framebuffer.min_bytes_per_pixel ||
+          tex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel)
+         sctx->framebuffer.min_bytes_per_pixel = tex->surface.bpe;
+   }
+
+   /* For optimal DCC performance. */
+   if (sctx->chip_class >= GFX10)
+      sctx->framebuffer.dcc_overwrite_combiner_watermark = 6;
+   else
+      sctx->framebuffer.dcc_overwrite_combiner_watermark = 4;
+
+   struct si_texture *zstex = NULL;
+
+   if (state->zsbuf) {
+      surf = (struct si_surface *)state->zsbuf;
+      zstex = (struct si_texture *)surf->base.texture;
+
+      if (!surf->depth_initialized) {
+         si_init_depth_surface(sctx, surf);
+      }
+
+      if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level, PIPE_MASK_ZS))
+         sctx->framebuffer.DB_has_shader_readable_metadata = true;
+
+      si_context_add_resource_size(sctx, surf->base.texture);
+
+      /* Update the minimum but don't keep 0. */
+      if (!sctx->framebuffer.min_bytes_per_pixel ||
+          zstex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel)
+         sctx->framebuffer.min_bytes_per_pixel = zstex->surface.bpe;
+   }
+
+   si_update_ps_colorbuf0_slot(sctx);
+   si_update_poly_offset_state(sctx);
+   si_update_ngg_small_prim_precision(sctx);
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
+
+   if (sctx->screen->dpbb_allowed)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
+
+   if (sctx->framebuffer.any_dst_linear != old_any_dst_linear)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+
+   if (sctx->screen->has_out_of_order_rast &&
+       (sctx->framebuffer.colorbuf_enabled_4bit != old_colorbuf_enabled_4bit ||
+        !!sctx->framebuffer.state.zsbuf != old_has_zsbuf ||
+        (zstex && zstex->surface.has_stencil != old_has_stencil)))
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+
+   if (sctx->framebuffer.nr_samples != old_nr_samples) {
+      struct pipe_constant_buffer constbuf = {0};
+
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+
+      constbuf.buffer = sctx->sample_pos_buffer;
+
+      /* Set sample locations as fragment shader constants. */
+      switch (sctx->framebuffer.nr_samples) {
+      case 1:
+         constbuf.buffer_offset = 0;
+         break;
+      case 2:
+         constbuf.buffer_offset =
+            (ubyte *)sctx->sample_positions.x2 - (ubyte *)sctx->sample_positions.x1;
+         break;
+      case 4:
+         constbuf.buffer_offset =
+            (ubyte *)sctx->sample_positions.x4 - (ubyte *)sctx->sample_positions.x1;
+         break;
+      case 8:
+         constbuf.buffer_offset =
+            (ubyte *)sctx->sample_positions.x8 - (ubyte *)sctx->sample_positions.x1;
+         break;
+      case 16:
+         constbuf.buffer_offset =
+            (ubyte *)sctx->sample_positions.x16 - (ubyte *)sctx->sample_positions.x1;
+         break;
+      default:
+         PRINT_ERR("Requested an invalid number of samples %i.\n", sctx->framebuffer.nr_samples);
+         assert(0);
+      }
+      constbuf.buffer_size = sctx->framebuffer.nr_samples * 2 * 4;
+      si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &constbuf);
+
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
+   }
+
+   sctx->do_update_shaders = true;
+
+   if (!sctx->decompression_enabled) {
+      /* Prevent texture decompression when the framebuffer state
+       * changes come from the decompression passes themselves.
+       */
+      sctx->need_check_render_feedback = true;
+   }
 }
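
For reference, the dirty-mask trick in si_set_framebuffer_state, (1 << MAX2(old, new)) - 1, covers the larger of the old and new colorbuffer counts, so slots that just became unbound are still marked dirty and later re-emitted as disabled. A minimal standalone sketch of that arithmetic, with MAX2 redefined locally and hypothetical buffer counts chosen only for illustration:

#include <stdio.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
   unsigned old_nr_cbufs = 4; /* hypothetical: four MRTs were bound before */
   unsigned new_nr_cbufs = 2; /* the new framebuffer binds only two */

   /* Same expression as in si_set_framebuffer_state(): take the maximum of the
    * two counts so bits 2 and 3 are set as well, forcing the now-unbound slots
    * to be re-emitted (as COLOR_INVALID) instead of keeping stale registers.
    */
   unsigned dirty_cbufs = (1u << MAX2(old_nr_cbufs, new_nr_cbufs)) - 1;

   printf("dirty_cbufs = 0x%x\n", dirty_cbufs); /* prints dirty_cbufs = 0xf */
   return 0;
}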
 
 static void si_emit_framebuffer_state(struct si_context *sctx)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       struct pipe_framebuffer_state *state = &sctx->framebuffer.state;
-       unsigned i, nr_cbufs = state->nr_cbufs;
-       struct si_texture *tex = NULL;
-       struct si_surface *cb = NULL;
-       unsigned cb_color_info = 0;
-
-       /* Colorbuffers. */
-       for (i = 0; i < nr_cbufs; i++) {
-               uint64_t cb_color_base, cb_color_fmask, cb_color_cmask, cb_dcc_base;
-               unsigned cb_color_attrib;
-
-               if (!(sctx->framebuffer.dirty_cbufs & (1 << i)))
-                       continue;
-
-               cb = (struct si_surface*)state->cbufs[i];
-               if (!cb) {
-                       radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
-                                              S_028C70_FORMAT(V_028C70_COLOR_INVALID));
-                       continue;
-               }
-
-               tex = (struct si_texture *)cb->base.texture;
-               radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-                                     &tex->buffer, RADEON_USAGE_READWRITE,
-                                     tex->buffer.b.b.nr_samples > 1 ?
-                                             RADEON_PRIO_COLOR_BUFFER_MSAA :
-                                             RADEON_PRIO_COLOR_BUFFER);
-
-               if (tex->cmask_buffer && tex->cmask_buffer != &tex->buffer) {
-                       radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-                               tex->cmask_buffer, RADEON_USAGE_READWRITE,
-                               RADEON_PRIO_SEPARATE_META);
-               }
-
-               if (tex->dcc_separate_buffer)
-                       radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-                                                 tex->dcc_separate_buffer,
-                                                 RADEON_USAGE_READWRITE,
-                                                 RADEON_PRIO_SEPARATE_META);
-
-               /* Compute mutable surface parameters. */
-               cb_color_base = tex->buffer.gpu_address >> 8;
-               cb_color_fmask = 0;
-               cb_color_cmask = tex->cmask_base_address_reg;
-               cb_dcc_base = 0;
-               cb_color_info = cb->cb_color_info | tex->cb_color_info;
-               cb_color_attrib = cb->cb_color_attrib;
-
-               if (cb->base.u.tex.level > 0)
-                       cb_color_info &= C_028C70_FAST_CLEAR;
-
-               if (tex->surface.fmask_offset) {
-                       cb_color_fmask = (tex->buffer.gpu_address + tex->surface.fmask_offset) >> 8;
-                       cb_color_fmask |= tex->surface.fmask_tile_swizzle;
-               }
-
-               /* Set up DCC. */
-               if (vi_dcc_enabled(tex, cb->base.u.tex.level)) {
-                       bool is_msaa_resolve_dst = state->cbufs[0] &&
-                                                  state->cbufs[0]->texture->nr_samples > 1 &&
-                                                  state->cbufs[1] == &cb->base &&
-                                                  state->cbufs[1]->texture->nr_samples <= 1;
-
-                       if (!is_msaa_resolve_dst)
-                               cb_color_info |= S_028C70_DCC_ENABLE(1);
-
-                       cb_dcc_base = ((!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) +
-                                      tex->surface.dcc_offset) >> 8;
-
-                       unsigned dcc_tile_swizzle = tex->surface.tile_swizzle;
-                       dcc_tile_swizzle &= (tex->surface.dcc_alignment - 1) >> 8;
-                       cb_dcc_base |= dcc_tile_swizzle;
-               }
-
-               if (sctx->chip_class >= GFX10) {
-                       unsigned cb_color_attrib3;
-
-                       /* Set mutable surface parameters. */
-                       cb_color_base += tex->surface.u.gfx9.surf_offset >> 8;
-                       cb_color_base |= tex->surface.tile_swizzle;
-                       if (!tex->surface.fmask_offset)
-                               cb_color_fmask = cb_color_base;
-                       if (cb->base.u.tex.level > 0)
-                               cb_color_cmask = cb_color_base;
-
-                       cb_color_attrib3 = cb->cb_color_attrib3 |
-                                          S_028EE0_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) |
-                                          S_028EE0_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) |
-                                          S_028EE0_CMASK_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) |
-                                          S_028EE0_DCC_PIPE_ALIGNED(tex->surface.u.gfx9.dcc.pipe_aligned);
-
-                       radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 14);
-                       radeon_emit(cs, cb_color_base);         /* CB_COLOR0_BASE */
-                       radeon_emit(cs, 0);                     /* hole */
-                       radeon_emit(cs, 0);                     /* hole */
-                       radeon_emit(cs, cb->cb_color_view);     /* CB_COLOR0_VIEW */
-                       radeon_emit(cs, cb_color_info);         /* CB_COLOR0_INFO */
-                       radeon_emit(cs, cb_color_attrib);       /* CB_COLOR0_ATTRIB */
-                       radeon_emit(cs, cb->cb_dcc_control);    /* CB_COLOR0_DCC_CONTROL */
-                       radeon_emit(cs, cb_color_cmask);        /* CB_COLOR0_CMASK */
-                       radeon_emit(cs, 0);                     /* hole */
-                       radeon_emit(cs, cb_color_fmask);        /* CB_COLOR0_FMASK */
-                       radeon_emit(cs, 0);                     /* hole */
-                       radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */
-                       radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */
-                       radeon_emit(cs, cb_dcc_base);           /* CB_COLOR0_DCC_BASE */
-
-                       radeon_set_context_reg(cs, R_028E40_CB_COLOR0_BASE_EXT + i * 4,
-                                              cb_color_base >> 32);
-                       radeon_set_context_reg(cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + i * 4,
-                                              cb_color_cmask >> 32);
-                       radeon_set_context_reg(cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + i * 4,
-                                              cb_color_fmask >> 32);
-                       radeon_set_context_reg(cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4,
-                                              cb_dcc_base >> 32);
-                       radeon_set_context_reg(cs, R_028EC0_CB_COLOR0_ATTRIB2 + i * 4,
-                                              cb->cb_color_attrib2);
-                       radeon_set_context_reg(cs, R_028EE0_CB_COLOR0_ATTRIB3 + i * 4,
-                                              cb_color_attrib3);
-               } else if (sctx->chip_class == GFX9) {
-                       struct gfx9_surf_meta_flags meta;
-
-                       if (tex->surface.dcc_offset)
-                               meta = tex->surface.u.gfx9.dcc;
-                       else
-                               meta = tex->surface.u.gfx9.cmask;
-
-                       /* Set mutable surface parameters. */
-                       cb_color_base += tex->surface.u.gfx9.surf_offset >> 8;
-                       cb_color_base |= tex->surface.tile_swizzle;
-                       if (!tex->surface.fmask_offset)
-                               cb_color_fmask = cb_color_base;
-                       if (cb->base.u.tex.level > 0)
-                               cb_color_cmask = cb_color_base;
-                       cb_color_attrib |= S_028C74_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) |
-                                          S_028C74_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) |
-                                          S_028C74_RB_ALIGNED(meta.rb_aligned) |
-                                          S_028C74_PIPE_ALIGNED(meta.pipe_aligned);
-
-                       radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 15);
-                       radeon_emit(cs, cb_color_base);         /* CB_COLOR0_BASE */
-                       radeon_emit(cs, S_028C64_BASE_256B(cb_color_base >> 32)); /* CB_COLOR0_BASE_EXT */
-                       radeon_emit(cs, cb->cb_color_attrib2);  /* CB_COLOR0_ATTRIB2 */
-                       radeon_emit(cs, cb->cb_color_view);     /* CB_COLOR0_VIEW */
-                       radeon_emit(cs, cb_color_info);         /* CB_COLOR0_INFO */
-                       radeon_emit(cs, cb_color_attrib);       /* CB_COLOR0_ATTRIB */
-                       radeon_emit(cs, cb->cb_dcc_control);    /* CB_COLOR0_DCC_CONTROL */
-                       radeon_emit(cs, cb_color_cmask);        /* CB_COLOR0_CMASK */
-                       radeon_emit(cs, S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */
-                       radeon_emit(cs, cb_color_fmask);        /* CB_COLOR0_FMASK */
-                       radeon_emit(cs, S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */
-                       radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */
-                       radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */
-                       radeon_emit(cs, cb_dcc_base);           /* CB_COLOR0_DCC_BASE */
-                       radeon_emit(cs, S_028C98_BASE_256B(cb_dcc_base >> 32)); /* CB_COLOR0_DCC_BASE_EXT */
-
-                       radeon_set_context_reg(cs, R_0287A0_CB_MRT0_EPITCH + i * 4,
-                                              S_0287A0_EPITCH(tex->surface.u.gfx9.surf.epitch));
-               } else {
-                       /* Compute mutable surface parameters (GFX6-GFX8). */
-                       const struct legacy_surf_level *level_info =
-                               &tex->surface.u.legacy.level[cb->base.u.tex.level];
-                       unsigned pitch_tile_max, slice_tile_max, tile_mode_index;
-                       unsigned cb_color_pitch, cb_color_slice, cb_color_fmask_slice;
-
-                       cb_color_base += level_info->offset >> 8;
-                       /* Only macrotiled modes can set tile swizzle. */
-                       if (level_info->mode == RADEON_SURF_MODE_2D)
-                               cb_color_base |= tex->surface.tile_swizzle;
-
-                       if (!tex->surface.fmask_offset)
-                               cb_color_fmask = cb_color_base;
-                       if (cb->base.u.tex.level > 0)
-                               cb_color_cmask = cb_color_base;
-                       if (cb_dcc_base)
-                               cb_dcc_base += level_info->dcc_offset >> 8;
-
-                       pitch_tile_max = level_info->nblk_x / 8 - 1;
-                       slice_tile_max = level_info->nblk_x *
-                                        level_info->nblk_y / 64 - 1;
-                       tile_mode_index = si_tile_mode_index(tex, cb->base.u.tex.level, false);
-
-                       cb_color_attrib |= S_028C74_TILE_MODE_INDEX(tile_mode_index);
-                       cb_color_pitch = S_028C64_TILE_MAX(pitch_tile_max);
-                       cb_color_slice = S_028C68_TILE_MAX(slice_tile_max);
-
-                       if (tex->surface.fmask_offset) {
-                               if (sctx->chip_class >= GFX7)
-                                       cb_color_pitch |= S_028C64_FMASK_TILE_MAX(tex->surface.u.legacy.fmask.pitch_in_pixels / 8 - 1);
-                               cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tex->surface.u.legacy.fmask.tiling_index);
-                               cb_color_fmask_slice = S_028C88_TILE_MAX(tex->surface.u.legacy.fmask.slice_tile_max);
-                       } else {
-                               /* This must be set for fast clear to work without FMASK. */
-                               if (sctx->chip_class >= GFX7)
-                                       cb_color_pitch |= S_028C64_FMASK_TILE_MAX(pitch_tile_max);
-                               cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tile_mode_index);
-                               cb_color_fmask_slice = S_028C88_TILE_MAX(slice_tile_max);
-                       }
-
-                       radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C,
-                                                  sctx->chip_class >= GFX8 ? 14 : 13);
-                       radeon_emit(cs, cb_color_base);         /* CB_COLOR0_BASE */
-                       radeon_emit(cs, cb_color_pitch);        /* CB_COLOR0_PITCH */
-                       radeon_emit(cs, cb_color_slice);        /* CB_COLOR0_SLICE */
-                       radeon_emit(cs, cb->cb_color_view);     /* CB_COLOR0_VIEW */
-                       radeon_emit(cs, cb_color_info);         /* CB_COLOR0_INFO */
-                       radeon_emit(cs, cb_color_attrib);       /* CB_COLOR0_ATTRIB */
-                       radeon_emit(cs, cb->cb_dcc_control);    /* CB_COLOR0_DCC_CONTROL */
-                       radeon_emit(cs, cb_color_cmask);        /* CB_COLOR0_CMASK */
-                       radeon_emit(cs, tex->surface.u.legacy.cmask_slice_tile_max); /* CB_COLOR0_CMASK_SLICE */
-                       radeon_emit(cs, cb_color_fmask);                /* CB_COLOR0_FMASK */
-                       radeon_emit(cs, cb_color_fmask_slice);          /* CB_COLOR0_FMASK_SLICE */
-                       radeon_emit(cs, tex->color_clear_value[0]);     /* CB_COLOR0_CLEAR_WORD0 */
-                       radeon_emit(cs, tex->color_clear_value[1]);     /* CB_COLOR0_CLEAR_WORD1 */
-
-                       if (sctx->chip_class >= GFX8) /* R_028C94_CB_COLOR0_DCC_BASE */
-                               radeon_emit(cs, cb_dcc_base);
-               }
-       }
-       for (; i < 8 ; i++)
-               if (sctx->framebuffer.dirty_cbufs & (1 << i))
-                       radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 0);
-
-       /* ZS buffer. */
-       if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) {
-               struct si_surface *zb = (struct si_surface*)state->zsbuf;
-               struct si_texture *tex = (struct si_texture*)zb->base.texture;
-
-               radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-                                     &tex->buffer, RADEON_USAGE_READWRITE,
-                                     zb->base.texture->nr_samples > 1 ?
-                                             RADEON_PRIO_DEPTH_BUFFER_MSAA :
-                                             RADEON_PRIO_DEPTH_BUFFER);
-
-               if (sctx->chip_class >= GFX10) {
-                       radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base);
-                       radeon_set_context_reg(cs, R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size);
-
-                       radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 7);
-                       radeon_emit(cs, S_02803C_RESOURCE_LEVEL(1));    /* DB_DEPTH_INFO */
-                       radeon_emit(cs, zb->db_z_info |                 /* DB_Z_INFO */
-                                   S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0));
-                       radeon_emit(cs, zb->db_stencil_info);           /* DB_STENCIL_INFO */
-                       radeon_emit(cs, zb->db_depth_base);             /* DB_Z_READ_BASE */
-                       radeon_emit(cs, zb->db_stencil_base);           /* DB_STENCIL_READ_BASE */
-                       radeon_emit(cs, zb->db_depth_base);             /* DB_Z_WRITE_BASE */
-                       radeon_emit(cs, zb->db_stencil_base);           /* DB_STENCIL_WRITE_BASE */
-
-                       radeon_set_context_reg_seq(cs, R_028068_DB_Z_READ_BASE_HI, 5);
-                       radeon_emit(cs, zb->db_depth_base >> 32);       /* DB_Z_READ_BASE_HI */
-                       radeon_emit(cs, zb->db_stencil_base >> 32);     /* DB_STENCIL_READ_BASE_HI */
-                       radeon_emit(cs, zb->db_depth_base >> 32);       /* DB_Z_WRITE_BASE_HI */
-                       radeon_emit(cs, zb->db_stencil_base >> 32);     /* DB_STENCIL_WRITE_BASE_HI */
-                       radeon_emit(cs, zb->db_htile_data_base >> 32);  /* DB_HTILE_DATA_BASE_HI */
-               } else if (sctx->chip_class == GFX9) {
-                       radeon_set_context_reg_seq(cs, R_028014_DB_HTILE_DATA_BASE, 3);
-                       radeon_emit(cs, zb->db_htile_data_base);        /* DB_HTILE_DATA_BASE */
-                       radeon_emit(cs, S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */
-                       radeon_emit(cs, zb->db_depth_size);             /* DB_DEPTH_SIZE */
-
-                       radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 10);
-                       radeon_emit(cs, zb->db_z_info |                 /* DB_Z_INFO */
-                                   S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0));
-                       radeon_emit(cs, zb->db_stencil_info);           /* DB_STENCIL_INFO */
-                       radeon_emit(cs, zb->db_depth_base);             /* DB_Z_READ_BASE */
-                       radeon_emit(cs, S_028044_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_READ_BASE_HI */
-                       radeon_emit(cs, zb->db_stencil_base);           /* DB_STENCIL_READ_BASE */
-                       radeon_emit(cs, S_02804C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
-                       radeon_emit(cs, zb->db_depth_base);             /* DB_Z_WRITE_BASE */
-                       radeon_emit(cs, S_028054_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_WRITE_BASE_HI */
-                       radeon_emit(cs, zb->db_stencil_base);           /* DB_STENCIL_WRITE_BASE */
-                       radeon_emit(cs, S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */
-
-                       radeon_set_context_reg_seq(cs, R_028068_DB_Z_INFO2, 2);
-                       radeon_emit(cs, zb->db_z_info2);        /* DB_Z_INFO2 */
-                       radeon_emit(cs, zb->db_stencil_info2);  /* DB_STENCIL_INFO2 */
-               } else {
-                       radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base);
-
-                       radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 9);
-                       radeon_emit(cs, zb->db_depth_info);     /* DB_DEPTH_INFO */
-                       radeon_emit(cs, zb->db_z_info |         /* DB_Z_INFO */
-                                   S_028040_ZRANGE_PRECISION(tex->depth_clear_value != 0));
-                       radeon_emit(cs, zb->db_stencil_info);   /* DB_STENCIL_INFO */
-                       radeon_emit(cs, zb->db_depth_base);     /* DB_Z_READ_BASE */
-                       radeon_emit(cs, zb->db_stencil_base);   /* DB_STENCIL_READ_BASE */
-                       radeon_emit(cs, zb->db_depth_base);     /* DB_Z_WRITE_BASE */
-                       radeon_emit(cs, zb->db_stencil_base);   /* DB_STENCIL_WRITE_BASE */
-                       radeon_emit(cs, zb->db_depth_size);     /* DB_DEPTH_SIZE */
-                       radeon_emit(cs, zb->db_depth_slice);    /* DB_DEPTH_SLICE */
-               }
-
-               radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
-               radeon_emit(cs, tex->stencil_clear_value); /* R_028028_DB_STENCIL_CLEAR */
-               radeon_emit(cs, fui(tex->depth_clear_value)); /* R_02802C_DB_DEPTH_CLEAR */
-
-               radeon_set_context_reg(cs, R_028008_DB_DEPTH_VIEW, zb->db_depth_view);
-               radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, zb->db_htile_surface);
-       } else if (sctx->framebuffer.dirty_zsbuf) {
-               if (sctx->chip_class == GFX9)
-                       radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 2);
-               else
-                       radeon_set_context_reg_seq(cs, R_028040_DB_Z_INFO, 2);
-
-               radeon_emit(cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */
-               radeon_emit(cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
-       }
-
-       /* Framebuffer dimensions. */
-        /* PA_SC_WINDOW_SCISSOR_TL is set in si_init_config() */
-       radeon_set_context_reg(cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
-                              S_028208_BR_X(state->width) | S_028208_BR_Y(state->height));
-
-       if (sctx->screen->dfsm_allowed) {
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-               radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
-       }
-
-       sctx->framebuffer.dirty_cbufs = 0;
-       sctx->framebuffer.dirty_zsbuf = false;
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   struct pipe_framebuffer_state *state = &sctx->framebuffer.state;
+   unsigned i, nr_cbufs = state->nr_cbufs;
+   struct si_texture *tex = NULL;
+   struct si_surface *cb = NULL;
+   unsigned cb_color_info = 0;
+
+   /* Colorbuffers. */
+   for (i = 0; i < nr_cbufs; i++) {
+      uint64_t cb_color_base, cb_color_fmask, cb_color_cmask, cb_dcc_base;
+      unsigned cb_color_attrib;
+
+      if (!(sctx->framebuffer.dirty_cbufs & (1 << i)))
+         continue;
+
+      cb = (struct si_surface *)state->cbufs[i];
+      if (!cb) {
+         radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
+                                S_028C70_FORMAT(V_028C70_COLOR_INVALID));
+         continue;
+      }
+
+      tex = (struct si_texture *)cb->base.texture;
+      radeon_add_to_buffer_list(
+         sctx, sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE,
+         tex->buffer.b.b.nr_samples > 1 ? RADEON_PRIO_COLOR_BUFFER_MSAA : RADEON_PRIO_COLOR_BUFFER);
+
+      if (tex->cmask_buffer && tex->cmask_buffer != &tex->buffer) {
+         radeon_add_to_buffer_list(sctx, sctx->gfx_cs, tex->cmask_buffer, RADEON_USAGE_READWRITE,
+                                   RADEON_PRIO_SEPARATE_META);
+      }
+
+      if (tex->dcc_separate_buffer)
+         radeon_add_to_buffer_list(sctx, sctx->gfx_cs, tex->dcc_separate_buffer,
+                                   RADEON_USAGE_READWRITE, RADEON_PRIO_SEPARATE_META);
+
+      /* Compute mutable surface parameters. */
+      cb_color_base = tex->buffer.gpu_address >> 8;
+      cb_color_fmask = 0;
+      cb_color_cmask = tex->cmask_base_address_reg;
+      cb_dcc_base = 0;
+      cb_color_info = cb->cb_color_info | tex->cb_color_info;
+      cb_color_attrib = cb->cb_color_attrib;
+
+      if (cb->base.u.tex.level > 0)
+         cb_color_info &= C_028C70_FAST_CLEAR;
+
+      if (tex->surface.fmask_offset) {
+         cb_color_fmask = (tex->buffer.gpu_address + tex->surface.fmask_offset) >> 8;
+         cb_color_fmask |= tex->surface.fmask_tile_swizzle;
+      }
+
+      /* Set up DCC. */
+      if (vi_dcc_enabled(tex, cb->base.u.tex.level)) {
+         bool is_msaa_resolve_dst = state->cbufs[0] && state->cbufs[0]->texture->nr_samples > 1 &&
+                                    state->cbufs[1] == &cb->base &&
+                                    state->cbufs[1]->texture->nr_samples <= 1;
+
+         if (!is_msaa_resolve_dst)
+            cb_color_info |= S_028C70_DCC_ENABLE(1);
+
+         cb_dcc_base =
+            ((!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) + tex->surface.dcc_offset) >>
+            8;
+
+         unsigned dcc_tile_swizzle = tex->surface.tile_swizzle;
+         dcc_tile_swizzle &= (tex->surface.dcc_alignment - 1) >> 8;
+         cb_dcc_base |= dcc_tile_swizzle;
+      }
+
+      if (sctx->chip_class >= GFX10) {
+         unsigned cb_color_attrib3;
+
+         /* Set mutable surface parameters. */
+         cb_color_base += tex->surface.u.gfx9.surf_offset >> 8;
+         cb_color_base |= tex->surface.tile_swizzle;
+         if (!tex->surface.fmask_offset)
+            cb_color_fmask = cb_color_base;
+         if (cb->base.u.tex.level > 0)
+            cb_color_cmask = cb_color_base;
+
+         cb_color_attrib3 = cb->cb_color_attrib3 |
+                            S_028EE0_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) |
+                            S_028EE0_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) |
+                            S_028EE0_CMASK_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) |
+                            S_028EE0_DCC_PIPE_ALIGNED(tex->surface.u.gfx9.dcc.pipe_aligned);
+
+         radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 14);
+         radeon_emit(cs, cb_color_base);             /* CB_COLOR0_BASE */
+         radeon_emit(cs, 0);                         /* hole */
+         radeon_emit(cs, 0);                         /* hole */
+         radeon_emit(cs, cb->cb_color_view);         /* CB_COLOR0_VIEW */
+         radeon_emit(cs, cb_color_info);             /* CB_COLOR0_INFO */
+         radeon_emit(cs, cb_color_attrib);           /* CB_COLOR0_ATTRIB */
+         radeon_emit(cs, cb->cb_dcc_control);        /* CB_COLOR0_DCC_CONTROL */
+         radeon_emit(cs, cb_color_cmask);            /* CB_COLOR0_CMASK */
+         radeon_emit(cs, 0);                         /* hole */
+         radeon_emit(cs, cb_color_fmask);            /* CB_COLOR0_FMASK */
+         radeon_emit(cs, 0);                         /* hole */
+         radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */
+         radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */
+         radeon_emit(cs, cb_dcc_base);               /* CB_COLOR0_DCC_BASE */
+
+         radeon_set_context_reg(cs, R_028E40_CB_COLOR0_BASE_EXT + i * 4, cb_color_base >> 32);
+         radeon_set_context_reg(cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + i * 4,
+                                cb_color_cmask >> 32);
+         radeon_set_context_reg(cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + i * 4,
+                                cb_color_fmask >> 32);
+         radeon_set_context_reg(cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4, cb_dcc_base >> 32);
+         radeon_set_context_reg(cs, R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, cb->cb_color_attrib2);
+         radeon_set_context_reg(cs, R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, cb_color_attrib3);
+      } else if (sctx->chip_class == GFX9) {
+         struct gfx9_surf_meta_flags meta;
+
+         if (tex->surface.dcc_offset)
+            meta = tex->surface.u.gfx9.dcc;
+         else
+            meta = tex->surface.u.gfx9.cmask;
+
+         /* Set mutable surface parameters. */
+         cb_color_base += tex->surface.u.gfx9.surf_offset >> 8;
+         cb_color_base |= tex->surface.tile_swizzle;
+         if (!tex->surface.fmask_offset)
+            cb_color_fmask = cb_color_base;
+         if (cb->base.u.tex.level > 0)
+            cb_color_cmask = cb_color_base;
+         cb_color_attrib |= S_028C74_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) |
+                            S_028C74_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) |
+                            S_028C74_RB_ALIGNED(meta.rb_aligned) |
+                            S_028C74_PIPE_ALIGNED(meta.pipe_aligned);
+
+         radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 15);
+         radeon_emit(cs, cb_color_base);                            /* CB_COLOR0_BASE */
+         radeon_emit(cs, S_028C64_BASE_256B(cb_color_base >> 32));  /* CB_COLOR0_BASE_EXT */
+         radeon_emit(cs, cb->cb_color_attrib2);                     /* CB_COLOR0_ATTRIB2 */
+         radeon_emit(cs, cb->cb_color_view);                        /* CB_COLOR0_VIEW */
+         radeon_emit(cs, cb_color_info);                            /* CB_COLOR0_INFO */
+         radeon_emit(cs, cb_color_attrib);                          /* CB_COLOR0_ATTRIB */
+         radeon_emit(cs, cb->cb_dcc_control);                       /* CB_COLOR0_DCC_CONTROL */
+         radeon_emit(cs, cb_color_cmask);                           /* CB_COLOR0_CMASK */
+         radeon_emit(cs, S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */
+         radeon_emit(cs, cb_color_fmask);                           /* CB_COLOR0_FMASK */
+         radeon_emit(cs, S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */
+         radeon_emit(cs, tex->color_clear_value[0]);                /* CB_COLOR0_CLEAR_WORD0 */
+         radeon_emit(cs, tex->color_clear_value[1]);                /* CB_COLOR0_CLEAR_WORD1 */
+         radeon_emit(cs, cb_dcc_base);                              /* CB_COLOR0_DCC_BASE */
+         radeon_emit(cs, S_028C98_BASE_256B(cb_dcc_base >> 32));    /* CB_COLOR0_DCC_BASE_EXT */
+
+         radeon_set_context_reg(cs, R_0287A0_CB_MRT0_EPITCH + i * 4,
+                                S_0287A0_EPITCH(tex->surface.u.gfx9.surf.epitch));
+      } else {
+         /* Compute mutable surface parameters (GFX6-GFX8). */
+         const struct legacy_surf_level *level_info =
+            &tex->surface.u.legacy.level[cb->base.u.tex.level];
+         unsigned pitch_tile_max, slice_tile_max, tile_mode_index;
+         unsigned cb_color_pitch, cb_color_slice, cb_color_fmask_slice;
+
+         cb_color_base += level_info->offset >> 8;
+         /* Only macrotiled modes can set tile swizzle. */
+         if (level_info->mode == RADEON_SURF_MODE_2D)
+            cb_color_base |= tex->surface.tile_swizzle;
+
+         if (!tex->surface.fmask_offset)
+            cb_color_fmask = cb_color_base;
+         if (cb->base.u.tex.level > 0)
+            cb_color_cmask = cb_color_base;
+         if (cb_dcc_base)
+            cb_dcc_base += level_info->dcc_offset >> 8;
+
+         pitch_tile_max = level_info->nblk_x / 8 - 1;
+         slice_tile_max = level_info->nblk_x * level_info->nblk_y / 64 - 1;
+         tile_mode_index = si_tile_mode_index(tex, cb->base.u.tex.level, false);
+
+         cb_color_attrib |= S_028C74_TILE_MODE_INDEX(tile_mode_index);
+         cb_color_pitch = S_028C64_TILE_MAX(pitch_tile_max);
+         cb_color_slice = S_028C68_TILE_MAX(slice_tile_max);
+
+         if (tex->surface.fmask_offset) {
+            if (sctx->chip_class >= GFX7)
+               cb_color_pitch |=
+                  S_028C64_FMASK_TILE_MAX(tex->surface.u.legacy.fmask.pitch_in_pixels / 8 - 1);
+            cb_color_attrib |=
+               S_028C74_FMASK_TILE_MODE_INDEX(tex->surface.u.legacy.fmask.tiling_index);
+            cb_color_fmask_slice = S_028C88_TILE_MAX(tex->surface.u.legacy.fmask.slice_tile_max);
+         } else {
+            /* This must be set for fast clear to work without FMASK. */
+            if (sctx->chip_class >= GFX7)
+               cb_color_pitch |= S_028C64_FMASK_TILE_MAX(pitch_tile_max);
+            cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tile_mode_index);
+            cb_color_fmask_slice = S_028C88_TILE_MAX(slice_tile_max);
+         }
+
+         radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C,
+                                    sctx->chip_class >= GFX8 ? 14 : 13);
+         radeon_emit(cs, cb_color_base);                              /* CB_COLOR0_BASE */
+         radeon_emit(cs, cb_color_pitch);                             /* CB_COLOR0_PITCH */
+         radeon_emit(cs, cb_color_slice);                             /* CB_COLOR0_SLICE */
+         radeon_emit(cs, cb->cb_color_view);                          /* CB_COLOR0_VIEW */
+         radeon_emit(cs, cb_color_info);                              /* CB_COLOR0_INFO */
+         radeon_emit(cs, cb_color_attrib);                            /* CB_COLOR0_ATTRIB */
+         radeon_emit(cs, cb->cb_dcc_control);                         /* CB_COLOR0_DCC_CONTROL */
+         radeon_emit(cs, cb_color_cmask);                             /* CB_COLOR0_CMASK */
+         radeon_emit(cs, tex->surface.u.legacy.cmask_slice_tile_max); /* CB_COLOR0_CMASK_SLICE */
+         radeon_emit(cs, cb_color_fmask);                             /* CB_COLOR0_FMASK */
+         radeon_emit(cs, cb_color_fmask_slice);                       /* CB_COLOR0_FMASK_SLICE */
+         radeon_emit(cs, tex->color_clear_value[0]);                  /* CB_COLOR0_CLEAR_WORD0 */
+         radeon_emit(cs, tex->color_clear_value[1]);                  /* CB_COLOR0_CLEAR_WORD1 */
+
+         if (sctx->chip_class >= GFX8) /* R_028C94_CB_COLOR0_DCC_BASE */
+            radeon_emit(cs, cb_dcc_base);
+      }
+   }
+   for (; i < 8; i++)
+      if (sctx->framebuffer.dirty_cbufs & (1 << i))
+         radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 0);
+
+   /* ZS buffer. */
+   if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) {
+      struct si_surface *zb = (struct si_surface *)state->zsbuf;
+      struct si_texture *tex = (struct si_texture *)zb->base.texture;
+
+      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE,
+                                zb->base.texture->nr_samples > 1 ? RADEON_PRIO_DEPTH_BUFFER_MSAA
+                                                                 : RADEON_PRIO_DEPTH_BUFFER);
+
+      if (sctx->chip_class >= GFX10) {
+         radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base);
+         radeon_set_context_reg(cs, R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size);
+
+         radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 7);
+         radeon_emit(cs, S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */
+         radeon_emit(cs, zb->db_z_info |              /* DB_Z_INFO */
+                            S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0));
+         radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */
+         radeon_emit(cs, zb->db_depth_base);   /* DB_Z_READ_BASE */
+         radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */
+         radeon_emit(cs, zb->db_depth_base);   /* DB_Z_WRITE_BASE */
+         radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */
+
+         radeon_set_context_reg_seq(cs, R_028068_DB_Z_READ_BASE_HI, 5);
+         radeon_emit(cs, zb->db_depth_base >> 32);      /* DB_Z_READ_BASE_HI */
+         radeon_emit(cs, zb->db_stencil_base >> 32);    /* DB_STENCIL_READ_BASE_HI */
+         radeon_emit(cs, zb->db_depth_base >> 32);      /* DB_Z_WRITE_BASE_HI */
+         radeon_emit(cs, zb->db_stencil_base >> 32);    /* DB_STENCIL_WRITE_BASE_HI */
+         radeon_emit(cs, zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */
+      } else if (sctx->chip_class == GFX9) {
+         radeon_set_context_reg_seq(cs, R_028014_DB_HTILE_DATA_BASE, 3);
+         radeon_emit(cs, zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */
+         radeon_emit(cs,
+                     S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */
+         radeon_emit(cs, zb->db_depth_size);                          /* DB_DEPTH_SIZE */
+
+         radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 10);
+         radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */
+                            S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0));
+         radeon_emit(cs, zb->db_stencil_info);                         /* DB_STENCIL_INFO */
+         radeon_emit(cs, zb->db_depth_base);                           /* DB_Z_READ_BASE */
+         radeon_emit(cs, S_028044_BASE_HI(zb->db_depth_base >> 32));   /* DB_Z_READ_BASE_HI */
+         radeon_emit(cs, zb->db_stencil_base);                         /* DB_STENCIL_READ_BASE */
+         radeon_emit(cs, S_02804C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
+         radeon_emit(cs, zb->db_depth_base);                           /* DB_Z_WRITE_BASE */
+         radeon_emit(cs, S_028054_BASE_HI(zb->db_depth_base >> 32));   /* DB_Z_WRITE_BASE_HI */
+         radeon_emit(cs, zb->db_stencil_base);                         /* DB_STENCIL_WRITE_BASE */
+         radeon_emit(cs,
+                     S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */
+
+         radeon_set_context_reg_seq(cs, R_028068_DB_Z_INFO2, 2);
+         radeon_emit(cs, zb->db_z_info2);       /* DB_Z_INFO2 */
+         radeon_emit(cs, zb->db_stencil_info2); /* DB_STENCIL_INFO2 */
+      } else {
+         radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base);
+
+         radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 9);
+         radeon_emit(cs, zb->db_depth_info); /* DB_DEPTH_INFO */
+         radeon_emit(cs, zb->db_z_info |     /* DB_Z_INFO */
+                            S_028040_ZRANGE_PRECISION(tex->depth_clear_value != 0));
+         radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */
+         radeon_emit(cs, zb->db_depth_base);   /* DB_Z_READ_BASE */
+         radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */
+         radeon_emit(cs, zb->db_depth_base);   /* DB_Z_WRITE_BASE */
+         radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */
+         radeon_emit(cs, zb->db_depth_size);   /* DB_DEPTH_SIZE */
+         radeon_emit(cs, zb->db_depth_slice);  /* DB_DEPTH_SLICE */
+      }
+
+      radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
+      radeon_emit(cs, tex->stencil_clear_value);    /* R_028028_DB_STENCIL_CLEAR */
+      radeon_emit(cs, fui(tex->depth_clear_value)); /* R_02802C_DB_DEPTH_CLEAR */
+
+      radeon_set_context_reg(cs, R_028008_DB_DEPTH_VIEW, zb->db_depth_view);
+      radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, zb->db_htile_surface);
+   } else if (sctx->framebuffer.dirty_zsbuf) {
+      if (sctx->chip_class == GFX9)
+         radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 2);
+      else
+         radeon_set_context_reg_seq(cs, R_028040_DB_Z_INFO, 2);
+
+      radeon_emit(cs, S_028040_FORMAT(V_028040_Z_INVALID));       /* DB_Z_INFO */
+      radeon_emit(cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
+   }
+
+   /* Framebuffer dimensions. */
+   /* PA_SC_WINDOW_SCISSOR_TL is set in si_init_config() */
+   radeon_set_context_reg(cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
+                          S_028208_BR_X(state->width) | S_028208_BR_Y(state->height));
+
+   if (sctx->screen->dfsm_allowed) {
+      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+      radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
+   }
+
+   sctx->framebuffer.dirty_cbufs = 0;
+   sctx->framebuffer.dirty_zsbuf = false;
 }
 
 static void si_emit_msaa_sample_locs(struct si_context *sctx)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
-       unsigned nr_samples = sctx->framebuffer.nr_samples;
-       bool has_msaa_sample_loc_bug = sctx->screen->info.has_msaa_sample_loc_bug;
-
-       /* Smoothing (only possible with nr_samples == 1) uses the same
-        * sample locations as the MSAA it simulates.
-        */
-       if (nr_samples <= 1 && sctx->smoothing_enabled)
-               nr_samples = SI_NUM_SMOOTH_AA_SAMPLES;
-
-       /* On Polaris, the small primitive filter uses the sample locations
-        * even when MSAA is off, so we need to make sure they're set to 0.
-        *
-        * GFX10 uses sample locations unconditionally, so they always need
-        * to be set up.
-        */
-       if ((nr_samples >= 2 || has_msaa_sample_loc_bug ||
-            sctx->chip_class >= GFX10) &&
-           nr_samples != sctx->sample_locs_num_samples) {
-               sctx->sample_locs_num_samples = nr_samples;
-               si_emit_sample_locations(cs, nr_samples);
-       }
-
-       if (sctx->family >= CHIP_POLARIS10) {
-               unsigned small_prim_filter_cntl =
-                       S_028830_SMALL_PRIM_FILTER_ENABLE(1) |
-                       /* line bug */
-                       S_028830_LINE_FILTER_DISABLE(sctx->family <= CHIP_POLARIS12);
-
-               /* The alternative of setting sample locations to 0 would
-                * require a DB flush to avoid Z errors, see
-                * https://bugs.freedesktop.org/show_bug.cgi?id=96908
-                */
-               if (has_msaa_sample_loc_bug &&
-                   sctx->framebuffer.nr_samples > 1 &&
-                   !rs->multisample_enable)
-                       small_prim_filter_cntl &= C_028830_SMALL_PRIM_FILTER_ENABLE;
-
-               radeon_opt_set_context_reg(sctx,
-                                          R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL,
-                                          SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL,
-                                          small_prim_filter_cntl);
-       }
-
-       /* The exclusion bits can be set to improve rasterization efficiency
-        * if no sample lies on the pixel boundary (-8 sample offset).
-        */
-       bool exclusion = sctx->chip_class >= GFX7 &&
-                        (!rs->multisample_enable || nr_samples != 16);
-       radeon_opt_set_context_reg(sctx, R_02882C_PA_SU_PRIM_FILTER_CNTL,
-                                  SI_TRACKED_PA_SU_PRIM_FILTER_CNTL,
-                                  S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) |
-                                  S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion));
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+   unsigned nr_samples = sctx->framebuffer.nr_samples;
+   bool has_msaa_sample_loc_bug = sctx->screen->info.has_msaa_sample_loc_bug;
+
+   /* Smoothing (only possible with nr_samples == 1) uses the same
+    * sample locations as the MSAA it simulates.
+    */
+   if (nr_samples <= 1 && sctx->smoothing_enabled)
+      nr_samples = SI_NUM_SMOOTH_AA_SAMPLES;
+
+   /* On Polaris, the small primitive filter uses the sample locations
+    * even when MSAA is off, so we need to make sure they're set to 0.
+    *
+    * GFX10 uses sample locations unconditionally, so they always need
+    * to be set up.
+    */
+   if ((nr_samples >= 2 || has_msaa_sample_loc_bug || sctx->chip_class >= GFX10) &&
+       nr_samples != sctx->sample_locs_num_samples) {
+      sctx->sample_locs_num_samples = nr_samples;
+      si_emit_sample_locations(cs, nr_samples);
+   }
+
+   if (sctx->family >= CHIP_POLARIS10) {
+      unsigned small_prim_filter_cntl =
+         S_028830_SMALL_PRIM_FILTER_ENABLE(1) |
+         /* line bug */
+         S_028830_LINE_FILTER_DISABLE(sctx->family <= CHIP_POLARIS12);
+
+      /* The alternative of setting sample locations to 0 would
+       * require a DB flush to avoid Z errors, see
+       * https://bugs.freedesktop.org/show_bug.cgi?id=96908
+       */
+      if (has_msaa_sample_loc_bug && sctx->framebuffer.nr_samples > 1 && !rs->multisample_enable)
+         small_prim_filter_cntl &= C_028830_SMALL_PRIM_FILTER_ENABLE;
+
+      radeon_opt_set_context_reg(sctx, R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL,
+                                 SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL, small_prim_filter_cntl);
+   }
+
+   /* The exclusion bits can be set to improve rasterization efficiency
+    * if no sample lies on the pixel boundary (-8 sample offset).
+    */
+   bool exclusion = sctx->chip_class >= GFX7 && (!rs->multisample_enable || nr_samples != 16);
+   radeon_opt_set_context_reg(
+      sctx, R_02882C_PA_SU_PRIM_FILTER_CNTL, SI_TRACKED_PA_SU_PRIM_FILTER_CNTL,
+      S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) | S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion));
 }
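/* For illustration only (a simplified restatement, not radeonsi code): the guard above
 * re-emits sample locations only when something actually consumes them (MSAA, the Polaris
 * small-primitive-filter bug, or GFX10) and only when the sample count changed, e.g.
 * switching from 4x to 8x MSAA emits new locations, while re-binding a framebuffer with
 * 8x already programmed does not.
 */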
 
 static bool si_out_of_order_rasterization(struct si_context *sctx)
 {
-       struct si_state_blend *blend = sctx->queued.named.blend;
-       struct si_state_dsa *dsa = sctx->queued.named.dsa;
+   struct si_state_blend *blend = sctx->queued.named.blend;
+   struct si_state_dsa *dsa = sctx->queued.named.dsa;
 
-       if (!sctx->screen->has_out_of_order_rast)
-               return false;
+   if (!sctx->screen->has_out_of_order_rast)
+      return false;
 
-       unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit;
+   unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit;
 
-       colormask &= blend->cb_target_enabled_4bit;
+   colormask &= blend->cb_target_enabled_4bit;
 
-       /* Conservative: No logic op. */
-       if (colormask && blend->logicop_enable)
-               return false;
+   /* Conservative: No logic op. */
+   if (colormask && blend->logicop_enable)
+      return false;
 
-       struct si_dsa_order_invariance dsa_order_invariant = {
-               .zs = true, .pass_set = true, .pass_last = false
-       };
+   struct si_dsa_order_invariance dsa_order_invariant = {.zs = true,
+                                                         .pass_set = true,
+                                                         .pass_last = false};
 
-       if (sctx->framebuffer.state.zsbuf) {
-               struct si_texture *zstex =
-                       (struct si_texture*)sctx->framebuffer.state.zsbuf->texture;
-               bool has_stencil = zstex->surface.has_stencil;
-               dsa_order_invariant = dsa->order_invariance[has_stencil];
-               if (!dsa_order_invariant.zs)
-                       return false;
+   if (sctx->framebuffer.state.zsbuf) {
+      struct si_texture *zstex = (struct si_texture *)sctx->framebuffer.state.zsbuf->texture;
+      bool has_stencil = zstex->surface.has_stencil;
+      dsa_order_invariant = dsa->order_invariance[has_stencil];
+      if (!dsa_order_invariant.zs)
+         return false;
 
-               /* The set of PS invocations is always order invariant,
-                * except when early Z/S tests are requested. */
-               if (sctx->ps_shader.cso &&
-                   sctx->ps_shader.cso->info.writes_memory &&
-                   sctx->ps_shader.cso->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] &&
-                   !dsa_order_invariant.pass_set)
-                       return false;
+      /* The set of PS invocations is always order invariant,
+       * except when early Z/S tests are requested. */
+      if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.writes_memory &&
+          sctx->ps_shader.cso->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] &&
+          !dsa_order_invariant.pass_set)
+         return false;
 
-               if (sctx->num_perfect_occlusion_queries != 0 &&
-                   !dsa_order_invariant.pass_set)
-                       return false;
-       }
+      if (sctx->num_perfect_occlusion_queries != 0 && !dsa_order_invariant.pass_set)
+         return false;
+   }
 
-       if (!colormask)
-               return true;
+   if (!colormask)
+      return true;
 
-       unsigned blendmask = colormask & blend->blend_enable_4bit;
+   unsigned blendmask = colormask & blend->blend_enable_4bit;
 
-       if (blendmask) {
-               /* Only commutative blending. */
-               if (blendmask & ~blend->commutative_4bit)
-                       return false;
+   if (blendmask) {
+      /* Only commutative blending. */
+      if (blendmask & ~blend->commutative_4bit)
+         return false;
 
-               if (!dsa_order_invariant.pass_set)
-                       return false;
-       }
+      if (!dsa_order_invariant.pass_set)
+         return false;
+   }
 
-       if (colormask & ~blendmask) {
-               if (!dsa_order_invariant.pass_last)
-                       return false;
-       }
+   if (colormask & ~blendmask) {
+      if (!dsa_order_invariant.pass_last)
+         return false;
+   }
 
-       return true;
+   return true;
 }
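/* For illustration only (a simplified sketch with hypothetical parameter names, not
 * radeonsi code): the decision made by si_out_of_order_rasterization() above, ignoring
 * the hardware capability check, the early-Z/occlusion-query cases and the
 * commutative-blend requirement. Logic ops and non-invariant Z/S always disqualify,
 * blended targets additionally need "pass_set", and unblended targets need "pass_last".
 */
static bool example_out_of_order_ok(bool logicop_on_color, bool zs_invariant,
                                    bool pass_set, bool pass_last,
                                    bool any_blended, bool any_unblended)
{
   if (logicop_on_color)
      return false;
   if (!zs_invariant)
      return false;
   if (any_blended && !pass_set)
      return false;
   if (any_unblended && !pass_last)
      return false;
   return true;
}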
 
 static void si_emit_msaa_config(struct si_context *sctx)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       unsigned num_tile_pipes = sctx->screen->info.num_tile_pipes;
-       /* 33% faster rendering to linear color buffers */
-       bool dst_is_linear = sctx->framebuffer.any_dst_linear;
-       bool out_of_order_rast = si_out_of_order_rasterization(sctx);
-       unsigned sc_mode_cntl_1 =
-               S_028A4C_WALK_SIZE(dst_is_linear) |
-               S_028A4C_WALK_FENCE_ENABLE(!dst_is_linear) |
-               S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) |
-               S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) |
-               S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) |
-               /* always 1: */
-               S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) |
-               S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) |
-               S_028A4C_TILE_WALK_ORDER_ENABLE(1) |
-               S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) |
-               S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
-               S_028A4C_FORCE_EOV_REZ_ENABLE(1);
-       unsigned db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
-                          S_028804_INCOHERENT_EQAA_READS(1) |
-                          S_028804_INTERPOLATE_COMP_Z(1) |
-                          S_028804_STATIC_ANCHOR_ASSOCIATIONS(1);
-       unsigned coverage_samples, color_samples, z_samples;
-       struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
-
-       /* S: Coverage samples (up to 16x):
-        * - Scan conversion samples (PA_SC_AA_CONFIG.MSAA_NUM_SAMPLES)
-        * - CB FMASK samples (CB_COLORi_ATTRIB.NUM_SAMPLES)
-        *
-        * Z: Z/S samples (up to 8x, must be <= coverage samples and >= color samples):
-        * - Value seen by DB (DB_Z_INFO.NUM_SAMPLES)
-        * - Value seen by CB, must be correct even if Z/S is unbound (DB_EQAA.MAX_ANCHOR_SAMPLES)
-        * # Missing samples are derived from Z planes if Z is compressed (up to 16x quality), or
-        * # from the closest defined sample if Z is uncompressed (same quality as the number of
-        * # Z samples).
-        *
-        * F: Color samples (up to 8x, must be <= coverage samples):
-        * - CB color samples (CB_COLORi_ATTRIB.NUM_FRAGMENTS)
-        * - PS iter samples (DB_EQAA.PS_ITER_SAMPLES)
-        *
-        * Can be anything between coverage and color samples:
-        * - SampleMaskIn samples (PA_SC_AA_CONFIG.MSAA_EXPOSED_SAMPLES)
-        * - SampleMaskOut samples (DB_EQAA.MASK_EXPORT_NUM_SAMPLES)
-        * - Alpha-to-coverage samples (DB_EQAA.ALPHA_TO_MASK_NUM_SAMPLES)
-        * - Occlusion query samples (DB_COUNT_CONTROL.SAMPLE_RATE)
-        * # All are currently set the same as coverage samples.
-        *
-        * If color samples < coverage samples, FMASK has a higher bpp to store an "unknown"
-        * flag for undefined color samples. A shader-based resolve must handle unknowns
-        * or mask them out with AND. Unknowns can also be guessed from neighbors via
-        * an edge-detect shader-based resolve, which is required to make "color samples = 1"
-        * useful. The CB resolve always drops unknowns.
-        *
-        * Sensible AA configurations:
-        *   EQAA 16s 8z 8f - might look the same as 16x MSAA if Z is compressed
-        *   EQAA 16s 8z 4f - might look the same as 16x MSAA if Z is compressed
-        *   EQAA 16s 4z 4f - might look the same as 16x MSAA if Z is compressed
-        *   EQAA  8s 8z 8f = 8x MSAA
-        *   EQAA  8s 8z 4f - might look the same as 8x MSAA
-        *   EQAA  8s 8z 2f - might look the same as 8x MSAA with low-density geometry
-        *   EQAA  8s 4z 4f - might look the same as 8x MSAA if Z is compressed
-        *   EQAA  8s 4z 2f - might look the same as 8x MSAA with low-density geometry if Z is compressed
-        *   EQAA  4s 4z 4f = 4x MSAA
-        *   EQAA  4s 4z 2f - might look the same as 4x MSAA with low-density geometry
-        *   EQAA  2s 2z 2f = 2x MSAA
-        */
-       if (sctx->framebuffer.nr_samples > 1 && rs->multisample_enable) {
-               coverage_samples = sctx->framebuffer.nr_samples;
-               color_samples = sctx->framebuffer.nr_color_samples;
-
-               if (sctx->framebuffer.state.zsbuf) {
-                       z_samples = sctx->framebuffer.state.zsbuf->texture->nr_samples;
-                       z_samples = MAX2(1, z_samples);
-               } else {
-                       z_samples = coverage_samples;
-               }
-       } else if (sctx->smoothing_enabled) {
-               coverage_samples = color_samples = z_samples = SI_NUM_SMOOTH_AA_SAMPLES;
-       } else {
-               coverage_samples = color_samples = z_samples = 1;
-       }
-
-       /* Required by OpenGL line rasterization.
-        *
-        * TODO: We should also enable perpendicular endcaps for AA lines,
-        *       but that requires implementing line stippling in the pixel
-        *       shader. SC can only do line stippling with axis-aligned
-        *       endcaps.
-        */
-       unsigned sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1);
-       unsigned sc_aa_config = 0;
-
-       if (coverage_samples > 1) {
-               /* distance from the pixel center, indexed by log2(nr_samples) */
-               static unsigned max_dist[] = {
-                       0, /* unused */
-                       4, /* 2x MSAA */
-                       6, /* 4x MSAA */
-                       7, /* 8x MSAA */
-                       8, /* 16x MSAA */
-               };
-               unsigned log_samples = util_logbase2(coverage_samples);
-               unsigned log_z_samples = util_logbase2(z_samples);
-               unsigned ps_iter_samples = si_get_ps_iter_samples(sctx);
-               unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples);
-
-               sc_line_cntl |= S_028BDC_EXPAND_LINE_WIDTH(1);
-               sc_aa_config = S_028BE0_MSAA_NUM_SAMPLES(log_samples) |
-                              S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) |
-                              S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples);
-
-               if (sctx->framebuffer.nr_samples > 1) {
-                       db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) |
-                                  S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
-                                  S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) |
-                                  S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples);
-                       sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1);
-               } else if (sctx->smoothing_enabled) {
-                       db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(log_samples);
-               }
-       }
-
-       unsigned initial_cdw = cs->current.cdw;
-
-       /* R_028BDC_PA_SC_LINE_CNTL, R_028BE0_PA_SC_AA_CONFIG */
-       radeon_opt_set_context_reg2(sctx, R_028BDC_PA_SC_LINE_CNTL,
-                                   SI_TRACKED_PA_SC_LINE_CNTL, sc_line_cntl,
-                                   sc_aa_config);
-       /* R_028804_DB_EQAA */
-       radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA, SI_TRACKED_DB_EQAA,
-                                  db_eqaa);
-       /* R_028A4C_PA_SC_MODE_CNTL_1 */
-       radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1,
-                                  SI_TRACKED_PA_SC_MODE_CNTL_1, sc_mode_cntl_1);
-
-       if (initial_cdw != cs->current.cdw) {
-               sctx->context_roll = true;
-
-               /* GFX9: Flush DFSM when the AA mode changes. */
-               if (sctx->screen->dfsm_allowed) {
-                       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-                       radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
-               }
-       }
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   unsigned num_tile_pipes = sctx->screen->info.num_tile_pipes;
+   /* 33% faster rendering to linear color buffers */
+   bool dst_is_linear = sctx->framebuffer.any_dst_linear;
+   bool out_of_order_rast = si_out_of_order_rasterization(sctx);
+   unsigned sc_mode_cntl_1 =
+      S_028A4C_WALK_SIZE(dst_is_linear) | S_028A4C_WALK_FENCE_ENABLE(!dst_is_linear) |
+      S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) |
+      S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) |
+      S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) |
+      /* always 1: */
+      S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) | S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) |
+      S_028A4C_TILE_WALK_ORDER_ENABLE(1) | S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) |
+      S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | S_028A4C_FORCE_EOV_REZ_ENABLE(1);
+   unsigned db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_INCOHERENT_EQAA_READS(1) |
+                      S_028804_INTERPOLATE_COMP_Z(1) | S_028804_STATIC_ANCHOR_ASSOCIATIONS(1);
+   unsigned coverage_samples, color_samples, z_samples;
+   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+
+   /* S: Coverage samples (up to 16x):
+    * - Scan conversion samples (PA_SC_AA_CONFIG.MSAA_NUM_SAMPLES)
+    * - CB FMASK samples (CB_COLORi_ATTRIB.NUM_SAMPLES)
+    *
+    * Z: Z/S samples (up to 8x, must be <= coverage samples and >= color samples):
+    * - Value seen by DB (DB_Z_INFO.NUM_SAMPLES)
+    * - Value seen by CB, must be correct even if Z/S is unbound (DB_EQAA.MAX_ANCHOR_SAMPLES)
+    * # Missing samples are derived from Z planes if Z is compressed (up to 16x quality), or
+    * # from the closest defined sample if Z is uncompressed (same quality as the number of
+    * # Z samples).
+    *
+    * F: Color samples (up to 8x, must be <= coverage samples):
+    * - CB color samples (CB_COLORi_ATTRIB.NUM_FRAGMENTS)
+    * - PS iter samples (DB_EQAA.PS_ITER_SAMPLES)
+    *
+    * Can be anything between coverage and color samples:
+    * - SampleMaskIn samples (PA_SC_AA_CONFIG.MSAA_EXPOSED_SAMPLES)
+    * - SampleMaskOut samples (DB_EQAA.MASK_EXPORT_NUM_SAMPLES)
+    * - Alpha-to-coverage samples (DB_EQAA.ALPHA_TO_MASK_NUM_SAMPLES)
+    * - Occlusion query samples (DB_COUNT_CONTROL.SAMPLE_RATE)
+    * # All are currently set the same as coverage samples.
+    *
+    * If color samples < coverage samples, FMASK has a higher bpp to store an "unknown"
+    * flag for undefined color samples. A shader-based resolve must handle unknowns
+    * or mask them out with AND. Unknowns can also be guessed from neighbors via
+    * an edge-detect shader-based resolve, which is required to make "color samples = 1"
+    * useful. The CB resolve always drops unknowns.
+    *
+    * Sensible AA configurations:
+    *   EQAA 16s 8z 8f - might look the same as 16x MSAA if Z is compressed
+    *   EQAA 16s 8z 4f - might look the same as 16x MSAA if Z is compressed
+    *   EQAA 16s 4z 4f - might look the same as 16x MSAA if Z is compressed
+    *   EQAA  8s 8z 8f = 8x MSAA
+    *   EQAA  8s 8z 4f - might look the same as 8x MSAA
+    *   EQAA  8s 8z 2f - might look the same as 8x MSAA with low-density geometry
+    *   EQAA  8s 4z 4f - might look the same as 8x MSAA if Z is compressed
+    *   EQAA  8s 4z 2f - might look the same as 8x MSAA with low-density geometry if Z is compressed
+    *   EQAA  4s 4z 4f = 4x MSAA
+    *   EQAA  4s 4z 2f - might look the same as 4x MSAA with low-density geometry
+    *   EQAA  2s 2z 2f = 2x MSAA
+    */
+   if (sctx->framebuffer.nr_samples > 1 && rs->multisample_enable) {
+      coverage_samples = sctx->framebuffer.nr_samples;
+      color_samples = sctx->framebuffer.nr_color_samples;
+
+      if (sctx->framebuffer.state.zsbuf) {
+         z_samples = sctx->framebuffer.state.zsbuf->texture->nr_samples;
+         z_samples = MAX2(1, z_samples);
+      } else {
+         z_samples = coverage_samples;
+      }
+   } else if (sctx->smoothing_enabled) {
+      coverage_samples = color_samples = z_samples = SI_NUM_SMOOTH_AA_SAMPLES;
+   } else {
+      coverage_samples = color_samples = z_samples = 1;
+   }
+
+   /* Required by OpenGL line rasterization.
+    *
+    * TODO: We should also enable perpendicular endcaps for AA lines,
+    *       but that requires implementing line stippling in the pixel
+    *       shader. SC can only do line stippling with axis-aligned
+    *       endcaps.
+    */
+   unsigned sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1);
+   unsigned sc_aa_config = 0;
+
+   if (coverage_samples > 1) {
+      /* distance from the pixel center, indexed by log2(nr_samples) */
+      static unsigned max_dist[] = {
+         0, /* unused */
+         4, /* 2x MSAA */
+         6, /* 4x MSAA */
+         7, /* 8x MSAA */
+         8, /* 16x MSAA */
+      };
+      unsigned log_samples = util_logbase2(coverage_samples);
+      unsigned log_z_samples = util_logbase2(z_samples);
+      unsigned ps_iter_samples = si_get_ps_iter_samples(sctx);
+      unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples);
+
+      sc_line_cntl |= S_028BDC_EXPAND_LINE_WIDTH(1);
+      sc_aa_config = S_028BE0_MSAA_NUM_SAMPLES(log_samples) |
+                     S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) |
+                     S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples);
+
+      if (sctx->framebuffer.nr_samples > 1) {
+         db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) |
+                    S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
+                    S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) |
+                    S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples);
+         sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1);
+      } else if (sctx->smoothing_enabled) {
+         db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(log_samples);
+      }
+   }
+
+   unsigned initial_cdw = cs->current.cdw;
+
+   /* R_028BDC_PA_SC_LINE_CNTL, R_028BE0_PA_SC_AA_CONFIG */
+   radeon_opt_set_context_reg2(sctx, R_028BDC_PA_SC_LINE_CNTL, SI_TRACKED_PA_SC_LINE_CNTL,
+                               sc_line_cntl, sc_aa_config);
+   /* R_028804_DB_EQAA */
+   radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA, SI_TRACKED_DB_EQAA, db_eqaa);
+   /* R_028A4C_PA_SC_MODE_CNTL_1 */
+   radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1, SI_TRACKED_PA_SC_MODE_CNTL_1,
+                              sc_mode_cntl_1);
+
+   if (initial_cdw != cs->current.cdw) {
+      sctx->context_roll = true;
+
+      /* GFX9: Flush DFSM when the AA mode changes. */
+      if (sctx->screen->dfsm_allowed) {
+         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+         radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
+      }
+   }
 }
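/* For illustration only (not radeonsi code; "example_ilog2" is a hypothetical helper):
 * the MSAA fields programmed above are log2 values. For a power-of-two sample count,
 * util_logbase2() reduces to the loop below. E.g. coverage_samples = 8 and z_samples = 4
 * give MSAA_NUM_SAMPLES = 3, MAX_ANCHOR_SAMPLES = 2 and MAX_SAMPLE_DIST = max_dist[3] = 7.
 */
static unsigned example_ilog2(unsigned n)
{
   unsigned result = 0;

   while (n > 1) {
      n >>= 1;
      result++;
   }
   return result;
}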
 
 void si_update_ps_iter_samples(struct si_context *sctx)
 {
-       if (sctx->framebuffer.nr_samples > 1)
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
-       if (sctx->screen->dpbb_allowed)
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
+   if (sctx->framebuffer.nr_samples > 1)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+   if (sctx->screen->dpbb_allowed)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
 }
 
 static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
+   struct si_context *sctx = (struct si_context *)ctx;
 
-       /* The hardware can only do sample shading with 2^n samples. */
-       min_samples = util_next_power_of_two(min_samples);
+   /* The hardware can only do sample shading with 2^n samples. */
+   min_samples = util_next_power_of_two(min_samples);
 
-       if (sctx->ps_iter_samples == min_samples)
-               return;
+   if (sctx->ps_iter_samples == min_samples)
+      return;
 
-       sctx->ps_iter_samples = min_samples;
-       sctx->do_update_shaders = true;
+   sctx->ps_iter_samples = min_samples;
+   sctx->do_update_shaders = true;
 
-       si_update_ps_iter_samples(sctx);
+   si_update_ps_iter_samples(sctx);
 }
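/* For illustration only (a worked example, not radeonsi code): sample shading only works
 * with 2^n samples, so the requested minimum is rounded up before it is compared against
 * the cached value, e.g. util_next_power_of_two(3) == 4 and util_next_power_of_two(4) == 4,
 * so a redundant request that lands on the same power of two returns early.
 */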
 
 /*
@@ -3786,650 +3586,607 @@ static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples)
  * Build the sampler view descriptor for a buffer texture.
  * @param state 256-bit descriptor; only the high 128 bits are filled in
  */
-void
-si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf,
-                         enum pipe_format format,
-                         unsigned offset, unsigned size,
-                         uint32_t *state)
+void si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf,
+                               enum pipe_format format, unsigned offset, unsigned size,
+                               uint32_t *state)
 {
-       const struct util_format_description *desc;
-       unsigned stride;
-       unsigned num_records;
-
-       desc = util_format_description(format);
-       stride = desc->block.bits / 8;
-
-       num_records = size / stride;
-       num_records = MIN2(num_records, (buf->b.b.width0 - offset) / stride);
-
-       /* The NUM_RECORDS field has a different meaning depending on the chip,
-        * instruction type, STRIDE, and SWIZZLE_ENABLE.
-        *
-        * GFX6-7,10:
-        * - If STRIDE == 0, it's in byte units.
-        * - If STRIDE != 0, it's in units of STRIDE, used with inst.IDXEN.
-        *
-        * GFX8:
-        * - For SMEM and STRIDE == 0, it's in byte units.
-        * - For SMEM and STRIDE != 0, it's in units of STRIDE.
-        * - For VMEM and STRIDE == 0 or SWIZZLE_ENABLE == 0, it's in byte units.
-        * - For VMEM and STRIDE != 0 and SWIZZLE_ENABLE == 1, it's in units of STRIDE.
-        * NOTE: There is incompatibility between VMEM and SMEM opcodes due to SWIZZLE_-
-        *       ENABLE. The workaround is to set STRIDE = 0 if SWIZZLE_ENABLE == 0 when
-        *       using SMEM. This can be done in the shader by clearing STRIDE with s_and.
-        *       That way the same descriptor can be used by both SMEM and VMEM.
-        *
-        * GFX9:
-        * - For SMEM and STRIDE == 0, it's in byte units.
-        * - For SMEM and STRIDE != 0, it's in units of STRIDE.
-        * - For VMEM and inst.IDXEN == 0 or STRIDE == 0, it's in byte units.
-        * - For VMEM and inst.IDXEN == 1 and STRIDE != 0, it's in units of STRIDE.
-        */
-       if (screen->info.chip_class == GFX8)
-               num_records *= stride;
-
-       state[4] = 0;
-       state[5] = S_008F04_STRIDE(stride);
-       state[6] = num_records;
-       state[7] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
-                  S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
-                  S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
-                  S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3]));
-
-       if (screen->info.chip_class >= GFX10) {
-               const struct gfx10_format *fmt = &gfx10_format_table[format];
-
-               /* OOB_SELECT chooses the out-of-bounds check:
-                *  - 0: (index >= NUM_RECORDS) || (offset >= STRIDE)
-                *  - 1: index >= NUM_RECORDS
-                *  - 2: NUM_RECORDS == 0
-                *  - 3: if SWIZZLE_ENABLE == 0: offset >= NUM_RECORDS
-                *       else: swizzle_address >= NUM_RECORDS
-                */
-               state[7] |= S_008F0C_FORMAT(fmt->img_format) |
-                           S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
-                           S_008F0C_RESOURCE_LEVEL(1);
-       } else {
-               int first_non_void;
-               unsigned num_format, data_format;
-
-               first_non_void = util_format_get_first_non_void_channel(format);
-               num_format = si_translate_buffer_numformat(&screen->b, desc, first_non_void);
-               data_format = si_translate_buffer_dataformat(&screen->b, desc, first_non_void);
-
-               state[7] |= S_008F0C_NUM_FORMAT(num_format) |
-                           S_008F0C_DATA_FORMAT(data_format);
-       }
+   const struct util_format_description *desc;
+   unsigned stride;
+   unsigned num_records;
+
+   desc = util_format_description(format);
+   stride = desc->block.bits / 8;
+
+   num_records = size / stride;
+   num_records = MIN2(num_records, (buf->b.b.width0 - offset) / stride);
+
+   /* The NUM_RECORDS field has a different meaning depending on the chip,
+    * instruction type, STRIDE, and SWIZZLE_ENABLE.
+    *
+    * GFX6-7,10:
+    * - If STRIDE == 0, it's in byte units.
+    * - If STRIDE != 0, it's in units of STRIDE, used with inst.IDXEN.
+    *
+    * GFX8:
+    * - For SMEM and STRIDE == 0, it's in byte units.
+    * - For SMEM and STRIDE != 0, it's in units of STRIDE.
+    * - For VMEM and STRIDE == 0 or SWIZZLE_ENABLE == 0, it's in byte units.
+    * - For VMEM and STRIDE != 0 and SWIZZLE_ENABLE == 1, it's in units of STRIDE.
+    * NOTE: There is incompatibility between VMEM and SMEM opcodes due to SWIZZLE_-
+    *       ENABLE. The workaround is to set STRIDE = 0 if SWIZZLE_ENABLE == 0 when
+    *       using SMEM. This can be done in the shader by clearing STRIDE with s_and.
+    *       That way the same descriptor can be used by both SMEM and VMEM.
+    *
+    * GFX9:
+    * - For SMEM and STRIDE == 0, it's in byte units.
+    * - For SMEM and STRIDE != 0, it's in units of STRIDE.
+    * - For VMEM and inst.IDXEN == 0 or STRIDE == 0, it's in byte units.
+    * - For VMEM and inst.IDXEN == 1 and STRIDE != 0, it's in units of STRIDE.
+    */
+   if (screen->info.chip_class == GFX8)
+      num_records *= stride;
+
+   state[4] = 0;
+   state[5] = S_008F04_STRIDE(stride);
+   state[6] = num_records;
+   state[7] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
+              S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
+              S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
+              S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3]));
+
+   if (screen->info.chip_class >= GFX10) {
+      const struct gfx10_format *fmt = &gfx10_format_table[format];
+
+      /* OOB_SELECT chooses the out-of-bounds check:
+       *  - 0: (index >= NUM_RECORDS) || (offset >= STRIDE)
+       *  - 1: index >= NUM_RECORDS
+       *  - 2: NUM_RECORDS == 0
+       *  - 3: if SWIZZLE_ENABLE == 0: offset >= NUM_RECORDS
+       *       else: swizzle_address >= NUM_RECORDS
+       */
+      state[7] |= S_008F0C_FORMAT(fmt->img_format) |
+                  S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
+                  S_008F0C_RESOURCE_LEVEL(1);
+   } else {
+      int first_non_void;
+      unsigned num_format, data_format;
+
+      first_non_void = util_format_get_first_non_void_channel(format);
+      num_format = si_translate_buffer_numformat(&screen->b, desc, first_non_void);
+      data_format = si_translate_buffer_dataformat(&screen->b, desc, first_non_void);
+
+      state[7] |= S_008F0C_NUM_FORMAT(num_format) | S_008F0C_DATA_FORMAT(data_format);
+   }
 }
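/* For illustration only (a standalone restatement with hypothetical names, not radeonsi
 * code): how NUM_RECORDS is derived above, including the GFX8 case where the field is
 * programmed in bytes rather than records (see the per-chip semantics in the comment).
 */
static unsigned example_buffer_num_records(unsigned size, unsigned stride,
                                           unsigned buffer_width0, unsigned offset,
                                           bool is_gfx8)
{
   unsigned num_records = size / stride;
   unsigned max_records = (buffer_width0 - offset) / stride;

   if (num_records > max_records)
      num_records = max_records;

   if (is_gfx8)
      num_records *= stride; /* record count -> byte count */

   return num_records;
}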
 
 static unsigned gfx9_border_color_swizzle(const unsigned char swizzle[4])
 {
-       unsigned bc_swizzle = V_008F20_BC_SWIZZLE_XYZW;
-
-       if (swizzle[3] == PIPE_SWIZZLE_X) {
-               /* For the pre-defined border color values (white, opaque
-                * black, transparent black), the only thing that matters is
-                * that the alpha channel winds up in the correct place
-                * (because the RGB channels are all the same) so either of
-                * these enumerations will work.
-                */
-               if (swizzle[2] == PIPE_SWIZZLE_Y)
-                       bc_swizzle = V_008F20_BC_SWIZZLE_WZYX;
-               else
-                       bc_swizzle = V_008F20_BC_SWIZZLE_WXYZ;
-       } else if (swizzle[0] == PIPE_SWIZZLE_X) {
-               if (swizzle[1] == PIPE_SWIZZLE_Y)
-                       bc_swizzle = V_008F20_BC_SWIZZLE_XYZW;
-               else
-                       bc_swizzle = V_008F20_BC_SWIZZLE_XWYZ;
-       } else if (swizzle[1] == PIPE_SWIZZLE_X) {
-               bc_swizzle = V_008F20_BC_SWIZZLE_YXWZ;
-       } else if (swizzle[2] == PIPE_SWIZZLE_X) {
-               bc_swizzle = V_008F20_BC_SWIZZLE_ZYXW;
-       }
-
-       return bc_swizzle;
+   unsigned bc_swizzle = V_008F20_BC_SWIZZLE_XYZW;
+
+   if (swizzle[3] == PIPE_SWIZZLE_X) {
+      /* For the pre-defined border color values (white, opaque
+       * black, transparent black), the only thing that matters is
+       * that the alpha channel winds up in the correct place
+       * (because the RGB channels are all the same) so either of
+       * these enumerations will work.
+       */
+      if (swizzle[2] == PIPE_SWIZZLE_Y)
+         bc_swizzle = V_008F20_BC_SWIZZLE_WZYX;
+      else
+         bc_swizzle = V_008F20_BC_SWIZZLE_WXYZ;
+   } else if (swizzle[0] == PIPE_SWIZZLE_X) {
+      if (swizzle[1] == PIPE_SWIZZLE_Y)
+         bc_swizzle = V_008F20_BC_SWIZZLE_XYZW;
+      else
+         bc_swizzle = V_008F20_BC_SWIZZLE_XWYZ;
+   } else if (swizzle[1] == PIPE_SWIZZLE_X) {
+      bc_swizzle = V_008F20_BC_SWIZZLE_YXWZ;
+   } else if (swizzle[2] == PIPE_SWIZZLE_X) {
+      bc_swizzle = V_008F20_BC_SWIZZLE_ZYXW;
+   }
+
+   return bc_swizzle;
 }
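/* For illustration only (a worked example, not radeonsi code): for a BGRA-style view
 * swizzle {PIPE_SWIZZLE_Z, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_X, PIPE_SWIZZLE_W}, neither
 * swizzle[3], swizzle[0] nor swizzle[1] is PIPE_SWIZZLE_X, but swizzle[2] is, so the
 * function above returns V_008F20_BC_SWIZZLE_ZYXW.
 */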
 
 /**
  * Build the sampler view descriptor for a texture.
  */
-static void
-gfx10_make_texture_descriptor(struct si_screen *screen,
-                             struct si_texture *tex,
-                             bool sampler,
-                             enum pipe_texture_target target,
-                             enum pipe_format pipe_format,
-                             const unsigned char state_swizzle[4],
-                             unsigned first_level, unsigned last_level,
-                             unsigned first_layer, unsigned last_layer,
-                             unsigned width, unsigned height, unsigned depth,
-                             uint32_t *state,
-                             uint32_t *fmask_state)
+static void gfx10_make_texture_descriptor(
+   struct si_screen *screen, struct si_texture *tex, bool sampler, enum pipe_texture_target target,
+   enum pipe_format pipe_format, const unsigned char state_swizzle[4], unsigned first_level,
+   unsigned last_level, unsigned first_layer, unsigned last_layer, unsigned width, unsigned height,
+   unsigned depth, uint32_t *state, uint32_t *fmask_state)
 {
-       struct pipe_resource *res = &tex->buffer.b.b;
-       const struct util_format_description *desc;
-       unsigned img_format;
-       unsigned char swizzle[4];
-       unsigned type;
-       uint64_t va;
-
-       desc = util_format_description(pipe_format);
-       img_format = gfx10_format_table[pipe_format].img_format;
-
-       if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
-               const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0};
-               const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1};
-               const unsigned char swizzle_wwww[4] = {3, 3, 3, 3};
-               bool is_stencil = false;
-
-               switch (pipe_format) {
-               case PIPE_FORMAT_S8_UINT_Z24_UNORM:
-               case PIPE_FORMAT_X32_S8X24_UINT:
-               case PIPE_FORMAT_X8Z24_UNORM:
-                       util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle);
-                       is_stencil = true;
-                       break;
-               case PIPE_FORMAT_X24S8_UINT:
-                       /*
-                        * X24S8 is implemented as an 8_8_8_8 data format, to
-                        * fix texture gathers. This affects at least
-                        * GL45-CTS.texture_cube_map_array.sampling on GFX8.
-                        */
-                       util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle);
-                       is_stencil = true;
-                       break;
-               default:
-                       util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle);
-                       is_stencil = pipe_format == PIPE_FORMAT_S8_UINT;
-               }
-
-               if (tex->upgraded_depth && !is_stencil) {
-                       assert(img_format == V_008F0C_IMG_FORMAT_32_FLOAT);
-                       img_format = V_008F0C_IMG_FORMAT_32_FLOAT_CLAMP;
-               }
-       } else {
-               util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle);
-       }
-
-       if (!sampler &&
-           (res->target == PIPE_TEXTURE_CUBE ||
-            res->target == PIPE_TEXTURE_CUBE_ARRAY)) {
-               /* For the purpose of shader images, treat cube maps as 2D
-                * arrays.
-                */
-               type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
-       } else {
-               type = si_tex_dim(screen, tex, target, res->nr_samples);
-       }
-
-       if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) {
-               height = 1;
-               depth = res->array_size;
-       } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY ||
-                  type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) {
-               if (sampler || res->target != PIPE_TEXTURE_3D)
-                       depth = res->array_size;
-       } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE)
-               depth = res->array_size / 6;
-
-       state[0] = 0;
-       state[1] = S_00A004_FORMAT(img_format) |
-                  S_00A004_WIDTH_LO(width - 1);
-       state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) |
-                  S_00A008_HEIGHT(height - 1) |
-                  S_00A008_RESOURCE_LEVEL(1);
-       state[3] = S_00A00C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
-                  S_00A00C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
-                  S_00A00C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
-                  S_00A00C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
-                  S_00A00C_BASE_LEVEL(res->nr_samples > 1 ?
-                                       0 : first_level) |
-                  S_00A00C_LAST_LEVEL(res->nr_samples > 1 ?
-                                       util_logbase2(res->nr_samples) :
-                                       last_level) |
-                  S_00A00C_BC_SWIZZLE(gfx9_border_color_swizzle(desc->swizzle)) |
-                  S_00A00C_TYPE(type);
-       /* Depth is the the last accessible layer on gfx9+. The hw doesn't need
-        * to know the total number of layers.
-        */
-       state[4] = S_00A010_DEPTH((type == V_008F1C_SQ_RSRC_IMG_3D && sampler)
-                                 ? depth - 1 : last_layer) |
-                  S_00A010_BASE_ARRAY(first_layer);
-       state[5] = S_00A014_ARRAY_PITCH(!!(type == V_008F1C_SQ_RSRC_IMG_3D && !sampler)) |
-                  S_00A014_MAX_MIP(res->nr_samples > 1 ?
-                                   util_logbase2(res->nr_samples) :
-                                   tex->buffer.b.b.last_level) |
-                  S_00A014_PERF_MOD(4);
-       state[6] = 0;
-       state[7] = 0;
-
-       if (tex->surface.dcc_offset) {
-               state[6] |= S_00A018_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) |
-                           S_00A018_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) |
-                           S_00A018_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format));
-       }
-
-       /* Initialize the sampler view for FMASK. */
-       if (tex->surface.fmask_offset) {
-               uint32_t format;
-
-               va = tex->buffer.gpu_address + tex->surface.fmask_offset;
-
-#define FMASK(s,f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f)))
-               switch (FMASK(res->nr_samples, res->nr_storage_samples)) {
-               case FMASK(2,1):
-                       format = V_008F0C_IMG_FORMAT_FMASK8_S2_F1;
-                       break;
-               case FMASK(2,2):
-                       format = V_008F0C_IMG_FORMAT_FMASK8_S2_F2;
-                       break;
-               case FMASK(4,1):
-                       format = V_008F0C_IMG_FORMAT_FMASK8_S4_F1;
-                       break;
-               case FMASK(4,2):
-                       format = V_008F0C_IMG_FORMAT_FMASK8_S4_F2;
-                       break;
-               case FMASK(4,4):
-                       format = V_008F0C_IMG_FORMAT_FMASK8_S4_F4;
-                       break;
-               case FMASK(8,1):
-                       format = V_008F0C_IMG_FORMAT_FMASK8_S8_F1;
-                       break;
-               case FMASK(8,2):
-                       format = V_008F0C_IMG_FORMAT_FMASK16_S8_F2;
-                       break;
-               case FMASK(8,4):
-                       format = V_008F0C_IMG_FORMAT_FMASK32_S8_F4;
-                       break;
-               case FMASK(8,8):
-                       format = V_008F0C_IMG_FORMAT_FMASK32_S8_F8;
-                       break;
-               case FMASK(16,1):
-                       format = V_008F0C_IMG_FORMAT_FMASK16_S16_F1;
-                       break;
-               case FMASK(16,2):
-                       format = V_008F0C_IMG_FORMAT_FMASK32_S16_F2;
-                       break;
-               case FMASK(16,4):
-                       format = V_008F0C_IMG_FORMAT_FMASK64_S16_F4;
-                       break;
-               case FMASK(16,8):
-                       format = V_008F0C_IMG_FORMAT_FMASK64_S16_F8;
-                       break;
-               default:
-                       unreachable("invalid nr_samples");
-               }
+   struct pipe_resource *res = &tex->buffer.b.b;
+   const struct util_format_description *desc;
+   unsigned img_format;
+   unsigned char swizzle[4];
+   unsigned type;
+   uint64_t va;
+
+   desc = util_format_description(pipe_format);
+   img_format = gfx10_format_table[pipe_format].img_format;
+
+   if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+      const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0};
+      const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1};
+      const unsigned char swizzle_wwww[4] = {3, 3, 3, 3};
+      bool is_stencil = false;
+
+      switch (pipe_format) {
+      case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+      case PIPE_FORMAT_X32_S8X24_UINT:
+      case PIPE_FORMAT_X8Z24_UNORM:
+         util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle);
+         is_stencil = true;
+         break;
+      case PIPE_FORMAT_X24S8_UINT:
+         /*
+          * X24S8 is implemented as an 8_8_8_8 data format, to
+          * fix texture gathers. This affects at least
+          * GL45-CTS.texture_cube_map_array.sampling on GFX8.
+          */
+         util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle);
+         is_stencil = true;
+         break;
+      default:
+         util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle);
+         is_stencil = pipe_format == PIPE_FORMAT_S8_UINT;
+      }
+
+      if (tex->upgraded_depth && !is_stencil) {
+         assert(img_format == V_008F0C_IMG_FORMAT_32_FLOAT);
+         img_format = V_008F0C_IMG_FORMAT_32_FLOAT_CLAMP;
+      }
+   } else {
+      util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle);
+   }
+
+   if (!sampler && (res->target == PIPE_TEXTURE_CUBE || res->target == PIPE_TEXTURE_CUBE_ARRAY)) {
+      /* For the purpose of shader images, treat cube maps as 2D
+       * arrays.
+       */
+      type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
+   } else {
+      type = si_tex_dim(screen, tex, target, res->nr_samples);
+   }
+
+   if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) {
+      height = 1;
+      depth = res->array_size;
+   } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) {
+      if (sampler || res->target != PIPE_TEXTURE_3D)
+         depth = res->array_size;
+   } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE)
+      depth = res->array_size / 6;
+
+   state[0] = 0;
+   state[1] = S_00A004_FORMAT(img_format) | S_00A004_WIDTH_LO(width - 1);
+   state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | S_00A008_HEIGHT(height - 1) |
+              S_00A008_RESOURCE_LEVEL(1);
+   state[3] =
+      S_00A00C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
+      S_00A00C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
+      S_00A00C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
+      S_00A00C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
+      S_00A00C_BASE_LEVEL(res->nr_samples > 1 ? 0 : first_level) |
+      S_00A00C_LAST_LEVEL(res->nr_samples > 1 ? util_logbase2(res->nr_samples) : last_level) |
+      S_00A00C_BC_SWIZZLE(gfx9_border_color_swizzle(desc->swizzle)) | S_00A00C_TYPE(type);
+   /* Depth is the last accessible layer on gfx9+. The hw doesn't need
+    * to know the total number of layers.
+    */
+   state[4] =
+      S_00A010_DEPTH((type == V_008F1C_SQ_RSRC_IMG_3D && sampler) ? depth - 1 : last_layer) |
+      S_00A010_BASE_ARRAY(first_layer);
+   state[5] = S_00A014_ARRAY_PITCH(!!(type == V_008F1C_SQ_RSRC_IMG_3D && !sampler)) |
+              S_00A014_MAX_MIP(res->nr_samples > 1 ? util_logbase2(res->nr_samples)
+                                                   : tex->buffer.b.b.last_level) |
+              S_00A014_PERF_MOD(4);
+   state[6] = 0;
+   state[7] = 0;
+
+   if (tex->surface.dcc_offset) {
+      state[6] |= S_00A018_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) |
+                  S_00A018_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) |
+                  S_00A018_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format));
+   }
+
+   /* Initialize the sampler view for FMASK. */
+   if (tex->surface.fmask_offset) {
+      uint32_t format;
+
+      va = tex->buffer.gpu_address + tex->surface.fmask_offset;
+
+#define FMASK(s, f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f)))
+      switch (FMASK(res->nr_samples, res->nr_storage_samples)) {
+      case FMASK(2, 1):
+         format = V_008F0C_IMG_FORMAT_FMASK8_S2_F1;
+         break;
+      case FMASK(2, 2):
+         format = V_008F0C_IMG_FORMAT_FMASK8_S2_F2;
+         break;
+      case FMASK(4, 1):
+         format = V_008F0C_IMG_FORMAT_FMASK8_S4_F1;
+         break;
+      case FMASK(4, 2):
+         format = V_008F0C_IMG_FORMAT_FMASK8_S4_F2;
+         break;
+      case FMASK(4, 4):
+         format = V_008F0C_IMG_FORMAT_FMASK8_S4_F4;
+         break;
+      case FMASK(8, 1):
+         format = V_008F0C_IMG_FORMAT_FMASK8_S8_F1;
+         break;
+      case FMASK(8, 2):
+         format = V_008F0C_IMG_FORMAT_FMASK16_S8_F2;
+         break;
+      case FMASK(8, 4):
+         format = V_008F0C_IMG_FORMAT_FMASK32_S8_F4;
+         break;
+      case FMASK(8, 8):
+         format = V_008F0C_IMG_FORMAT_FMASK32_S8_F8;
+         break;
+      case FMASK(16, 1):
+         format = V_008F0C_IMG_FORMAT_FMASK16_S16_F1;
+         break;
+      case FMASK(16, 2):
+         format = V_008F0C_IMG_FORMAT_FMASK32_S16_F2;
+         break;
+      case FMASK(16, 4):
+         format = V_008F0C_IMG_FORMAT_FMASK64_S16_F4;
+         break;
+      case FMASK(16, 8):
+         format = V_008F0C_IMG_FORMAT_FMASK64_S16_F8;
+         break;
+      default:
+         unreachable("invalid nr_samples");
+      }
 #undef FMASK
-               fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle;
-               fmask_state[1] = S_00A004_BASE_ADDRESS_HI(va >> 40) |
-                                S_00A004_FORMAT(format) |
-                                S_00A004_WIDTH_LO(width - 1);
-               fmask_state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) |
-                                S_00A008_HEIGHT(height - 1) |
-                                S_00A008_RESOURCE_LEVEL(1);
-               fmask_state[3] = S_00A00C_DST_SEL_X(V_008F1C_SQ_SEL_X) |
-                                S_00A00C_DST_SEL_Y(V_008F1C_SQ_SEL_X) |
-                                S_00A00C_DST_SEL_Z(V_008F1C_SQ_SEL_X) |
-                                S_00A00C_DST_SEL_W(V_008F1C_SQ_SEL_X) |
-                                S_00A00C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) |
-                                S_00A00C_TYPE(si_tex_dim(screen, tex, target, 0));
-               fmask_state[4] = S_00A010_DEPTH(last_layer) |
-                                S_00A010_BASE_ARRAY(first_layer);
-               fmask_state[5] = 0;
-               fmask_state[6] = S_00A018_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned);
-               fmask_state[7] = 0;
-       }
+      fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle;
+      fmask_state[1] = S_00A004_BASE_ADDRESS_HI(va >> 40) | S_00A004_FORMAT(format) |
+                       S_00A004_WIDTH_LO(width - 1);
+      fmask_state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | S_00A008_HEIGHT(height - 1) |
+                       S_00A008_RESOURCE_LEVEL(1);
+      fmask_state[3] =
+         S_00A00C_DST_SEL_X(V_008F1C_SQ_SEL_X) | S_00A00C_DST_SEL_Y(V_008F1C_SQ_SEL_X) |
+         S_00A00C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | S_00A00C_DST_SEL_W(V_008F1C_SQ_SEL_X) |
+         S_00A00C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) |
+         S_00A00C_TYPE(si_tex_dim(screen, tex, target, 0));
+      fmask_state[4] = S_00A010_DEPTH(last_layer) | S_00A010_BASE_ARRAY(first_layer);
+      fmask_state[5] = 0;
+      fmask_state[6] = S_00A018_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned);
+      fmask_state[7] = 0;
+   }
 }
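/* For illustration only (not radeonsi code): the local FMASK(s, f) macro above packs
 * (coverage samples, color fragments) into one switch key as MAX2(1, s) * 16 + MAX2(1, f).
 * For example an 8-sample, 4-fragment EQAA surface gives FMASK(8, 4) == 132, which selects
 * V_008F0C_IMG_FORMAT_FMASK32_S8_F4.
 */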
 
 /**
  * Build the sampler view descriptor for a texture (SI-GFX9).
  */
-static void
-si_make_texture_descriptor(struct si_screen *screen,
-                          struct si_texture *tex,
-                          bool sampler,
-                          enum pipe_texture_target target,
-                          enum pipe_format pipe_format,
-                          const unsigned char state_swizzle[4],
-                          unsigned first_level, unsigned last_level,
-                          unsigned first_layer, unsigned last_layer,
-                          unsigned width, unsigned height, unsigned depth,
-                          uint32_t *state,
-                          uint32_t *fmask_state)
+static void si_make_texture_descriptor(struct si_screen *screen, struct si_texture *tex,
+                                       bool sampler, enum pipe_texture_target target,
+                                       enum pipe_format pipe_format,
+                                       const unsigned char state_swizzle[4], unsigned first_level,
+                                       unsigned last_level, unsigned first_layer,
+                                       unsigned last_layer, unsigned width, unsigned height,
+                                       unsigned depth, uint32_t *state, uint32_t *fmask_state)
 {
-       struct pipe_resource *res = &tex->buffer.b.b;
-       const struct util_format_description *desc;
-       unsigned char swizzle[4];
-       int first_non_void;
-       unsigned num_format, data_format, type, num_samples;
-       uint64_t va;
-
-       desc = util_format_description(pipe_format);
-
-       num_samples = desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS ?
-                       MAX2(1, res->nr_samples) :
-                       MAX2(1, res->nr_storage_samples);
-
-       if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
-               const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0};
-               const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1};
-               const unsigned char swizzle_wwww[4] = {3, 3, 3, 3};
-
-               switch (pipe_format) {
-               case PIPE_FORMAT_S8_UINT_Z24_UNORM:
-               case PIPE_FORMAT_X32_S8X24_UINT:
-               case PIPE_FORMAT_X8Z24_UNORM:
-                       util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle);
-                       break;
-               case PIPE_FORMAT_X24S8_UINT:
-                       /*
-                        * X24S8 is implemented as an 8_8_8_8 data format, to
-                        * fix texture gathers. This affects at least
-                        * GL45-CTS.texture_cube_map_array.sampling on GFX8.
-                        */
-                       if (screen->info.chip_class <= GFX8)
-                               util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle);
-                       else
-                               util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle);
-                       break;
-               default:
-                       util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle);
-               }
-       } else {
-               util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle);
-       }
-
-       first_non_void = util_format_get_first_non_void_channel(pipe_format);
-
-       switch (pipe_format) {
-       case PIPE_FORMAT_S8_UINT_Z24_UNORM:
-               num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
-               break;
-       default:
-               if (first_non_void < 0) {
-                       if (util_format_is_compressed(pipe_format)) {
-                               switch (pipe_format) {
-                               case PIPE_FORMAT_DXT1_SRGB:
-                               case PIPE_FORMAT_DXT1_SRGBA:
-                               case PIPE_FORMAT_DXT3_SRGBA:
-                               case PIPE_FORMAT_DXT5_SRGBA:
-                               case PIPE_FORMAT_BPTC_SRGBA:
-                               case PIPE_FORMAT_ETC2_SRGB8:
-                               case PIPE_FORMAT_ETC2_SRGB8A1:
-                               case PIPE_FORMAT_ETC2_SRGBA8:
-                                       num_format = V_008F14_IMG_NUM_FORMAT_SRGB;
-                                       break;
-                               case PIPE_FORMAT_RGTC1_SNORM:
-                               case PIPE_FORMAT_LATC1_SNORM:
-                               case PIPE_FORMAT_RGTC2_SNORM:
-                               case PIPE_FORMAT_LATC2_SNORM:
-                               case PIPE_FORMAT_ETC2_R11_SNORM:
-                               case PIPE_FORMAT_ETC2_RG11_SNORM:
-                               /* implies float, so use SNORM/UNORM to determine
-                                  whether data is signed or not */
-                               case PIPE_FORMAT_BPTC_RGB_FLOAT:
-                                       num_format = V_008F14_IMG_NUM_FORMAT_SNORM;
-                                       break;
-                               default:
-                                       num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
-                                       break;
-                               }
-                       } else if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
-                               num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
-                       } else {
-                               num_format = V_008F14_IMG_NUM_FORMAT_FLOAT;
-                       }
-               } else if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
-                       num_format = V_008F14_IMG_NUM_FORMAT_SRGB;
-               } else {
-                       num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
-
-                       switch (desc->channel[first_non_void].type) {
-                       case UTIL_FORMAT_TYPE_FLOAT:
-                               num_format = V_008F14_IMG_NUM_FORMAT_FLOAT;
-                               break;
-                       case UTIL_FORMAT_TYPE_SIGNED:
-                               if (desc->channel[first_non_void].normalized)
-                                       num_format = V_008F14_IMG_NUM_FORMAT_SNORM;
-                               else if (desc->channel[first_non_void].pure_integer)
-                                       num_format = V_008F14_IMG_NUM_FORMAT_SINT;
-                               else
-                                       num_format = V_008F14_IMG_NUM_FORMAT_SSCALED;
-                               break;
-                       case UTIL_FORMAT_TYPE_UNSIGNED:
-                               if (desc->channel[first_non_void].normalized)
-                                       num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
-                               else if (desc->channel[first_non_void].pure_integer)
-                                       num_format = V_008F14_IMG_NUM_FORMAT_UINT;
-                               else
-                                       num_format = V_008F14_IMG_NUM_FORMAT_USCALED;
-                       }
-               }
-       }
-
-       data_format = si_translate_texformat(&screen->b, pipe_format, desc, first_non_void);
-       if (data_format == ~0) {
-               data_format = 0;
-       }
-
-       /* S8 with Z32 HTILE needs a special format. */
-       if (screen->info.chip_class == GFX9 &&
-           pipe_format == PIPE_FORMAT_S8_UINT &&
-           tex->tc_compatible_htile)
-               data_format = V_008F14_IMG_DATA_FORMAT_S8_32;
-
-       if (!sampler &&
-           (res->target == PIPE_TEXTURE_CUBE ||
-            res->target == PIPE_TEXTURE_CUBE_ARRAY ||
-            (screen->info.chip_class <= GFX8 &&
-             res->target == PIPE_TEXTURE_3D))) {
-               /* For the purpose of shader images, treat cube maps and 3D
-                * textures as 2D arrays. For 3D textures, the address
-                * calculations for mipmaps are different, so we rely on the
-                * caller to effectively disable mipmaps.
-                */
-               type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
-
-               assert(res->target != PIPE_TEXTURE_3D || (first_level == 0 && last_level == 0));
-       } else {
-               type = si_tex_dim(screen, tex, target, num_samples);
-       }
-
-       if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) {
-               height = 1;
-               depth = res->array_size;
-       } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY ||
-                  type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) {
-               if (sampler || res->target != PIPE_TEXTURE_3D)
-                       depth = res->array_size;
-       } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE)
-               depth = res->array_size / 6;
-
-       state[0] = 0;
-       state[1] = (S_008F14_DATA_FORMAT(data_format) |
-                   S_008F14_NUM_FORMAT(num_format));
-       state[2] = (S_008F18_WIDTH(width - 1) |
-                   S_008F18_HEIGHT(height - 1) |
-                   S_008F18_PERF_MOD(4));
-       state[3] = (S_008F1C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
-                   S_008F1C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
-                   S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
-                   S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
-                   S_008F1C_BASE_LEVEL(num_samples > 1 ? 0 : first_level) |
-                   S_008F1C_LAST_LEVEL(num_samples > 1 ?
-                                       util_logbase2(num_samples) :
-                                       last_level) |
-                   S_008F1C_TYPE(type));
-       state[4] = 0;
-       state[5] = S_008F24_BASE_ARRAY(first_layer);
-       state[6] = 0;
-       state[7] = 0;
-
-       if (screen->info.chip_class == GFX9) {
-               unsigned bc_swizzle = gfx9_border_color_swizzle(desc->swizzle);
-
-               /* Depth is the last accessible layer on Gfx9.
-                * The hw doesn't need to know the total number of layers.
-                */
-               if (type == V_008F1C_SQ_RSRC_IMG_3D)
-                       state[4] |= S_008F20_DEPTH(depth - 1);
-               else
-                       state[4] |= S_008F20_DEPTH(last_layer);
-
-               state[4] |= S_008F20_BC_SWIZZLE(bc_swizzle);
-               state[5] |= S_008F24_MAX_MIP(num_samples > 1 ?
-                                            util_logbase2(num_samples) :
-                                            tex->buffer.b.b.last_level);
-       } else {
-               state[3] |= S_008F1C_POW2_PAD(res->last_level > 0);
-               state[4] |= S_008F20_DEPTH(depth - 1);
-               state[5] |= S_008F24_LAST_ARRAY(last_layer);
-       }
-
-       if (tex->surface.dcc_offset) {
-               state[6] = S_008F28_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format));
-       } else {
-               /* The last dword is unused by hw. The shader uses it to clear
-                * bits in the first dword of sampler state.
-                */
-               if (screen->info.chip_class <= GFX7 && res->nr_samples <= 1) {
-                       if (first_level == last_level)
-                               state[7] = C_008F30_MAX_ANISO_RATIO;
-                       else
-                               state[7] = 0xffffffff;
-               }
-       }
-
-       /* Initialize the sampler view for FMASK. */
-       if (tex->surface.fmask_offset) {
-               uint32_t data_format, num_format;
-
-               va = tex->buffer.gpu_address + tex->surface.fmask_offset;
-
-#define FMASK(s,f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f)))
-               if (screen->info.chip_class == GFX9) {
-                       data_format = V_008F14_IMG_DATA_FORMAT_FMASK;
-                       switch (FMASK(res->nr_samples, res->nr_storage_samples)) {
-                       case FMASK(2,1):
-                               num_format = V_008F14_IMG_FMASK_8_2_1;
-                               break;
-                       case FMASK(2,2):
-                               num_format = V_008F14_IMG_FMASK_8_2_2;
-                               break;
-                       case FMASK(4,1):
-                               num_format = V_008F14_IMG_FMASK_8_4_1;
-                               break;
-                       case FMASK(4,2):
-                               num_format = V_008F14_IMG_FMASK_8_4_2;
-                               break;
-                       case FMASK(4,4):
-                               num_format = V_008F14_IMG_FMASK_8_4_4;
-                               break;
-                       case FMASK(8,1):
-                               num_format = V_008F14_IMG_FMASK_8_8_1;
-                               break;
-                       case FMASK(8,2):
-                               num_format = V_008F14_IMG_FMASK_16_8_2;
-                               break;
-                       case FMASK(8,4):
-                               num_format = V_008F14_IMG_FMASK_32_8_4;
-                               break;
-                       case FMASK(8,8):
-                               num_format = V_008F14_IMG_FMASK_32_8_8;
-                               break;
-                       case FMASK(16,1):
-                               num_format = V_008F14_IMG_FMASK_16_16_1;
-                               break;
-                       case FMASK(16,2):
-                               num_format = V_008F14_IMG_FMASK_32_16_2;
-                               break;
-                       case FMASK(16,4):
-                               num_format = V_008F14_IMG_FMASK_64_16_4;
-                               break;
-                       case FMASK(16,8):
-                               num_format = V_008F14_IMG_FMASK_64_16_8;
-                               break;
-                       default:
-                               unreachable("invalid nr_samples");
-                       }
-               } else {
-                       switch (FMASK(res->nr_samples, res->nr_storage_samples)) {
-                       case FMASK(2,1):
-                               data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F1;
-                               break;
-                       case FMASK(2,2):
-                               data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F2;
-                               break;
-                       case FMASK(4,1):
-                               data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F1;
-                               break;
-                       case FMASK(4,2):
-                               data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F2;
-                               break;
-                       case FMASK(4,4):
-                               data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F4;
-                               break;
-                       case FMASK(8,1):
-                               data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S8_F1;
-                               break;
-                       case FMASK(8,2):
-                               data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S8_F2;
-                               break;
-                       case FMASK(8,4):
-                               data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F4;
-                               break;
-                       case FMASK(8,8):
-                               data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F8;
-                               break;
-                       case FMASK(16,1):
-                               data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S16_F1;
-                               break;
-                       case FMASK(16,2):
-                               data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S16_F2;
-                               break;
-                       case FMASK(16,4):
-                               data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F4;
-                               break;
-                       case FMASK(16,8):
-                               data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F8;
-                               break;
-                       default:
-                               unreachable("invalid nr_samples");
-                       }
-                       num_format = V_008F14_IMG_NUM_FORMAT_UINT;
-               }
+   struct pipe_resource *res = &tex->buffer.b.b;
+   const struct util_format_description *desc;
+   unsigned char swizzle[4];
+   int first_non_void;
+   unsigned num_format, data_format, type, num_samples;
+   uint64_t va;
+
+   desc = util_format_description(pipe_format);
+
+   num_samples = desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS ? MAX2(1, res->nr_samples)
+                                                               : MAX2(1, res->nr_storage_samples);
+
+   if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+      const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0};
+      const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1};
+      const unsigned char swizzle_wwww[4] = {3, 3, 3, 3};
+
+      switch (pipe_format) {
+      case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+      case PIPE_FORMAT_X32_S8X24_UINT:
+      case PIPE_FORMAT_X8Z24_UNORM:
+         util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle);
+         break;
+      case PIPE_FORMAT_X24S8_UINT:
+         /*
+          * X24S8 is implemented as an 8_8_8_8 data format, to
+          * fix texture gathers. This affects at least
+          * GL45-CTS.texture_cube_map_array.sampling on GFX8.
+          */
+         if (screen->info.chip_class <= GFX8)
+            util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle);
+         else
+            util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle);
+         break;
+      default:
+         util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle);
+      }
+   } else {
+      util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle);
+   }
+
+   first_non_void = util_format_get_first_non_void_channel(pipe_format);
+
+   switch (pipe_format) {
+   case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+      num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
+      break;
+   default:
+      if (first_non_void < 0) {
+         if (util_format_is_compressed(pipe_format)) {
+            switch (pipe_format) {
+            case PIPE_FORMAT_DXT1_SRGB:
+            case PIPE_FORMAT_DXT1_SRGBA:
+            case PIPE_FORMAT_DXT3_SRGBA:
+            case PIPE_FORMAT_DXT5_SRGBA:
+            case PIPE_FORMAT_BPTC_SRGBA:
+            case PIPE_FORMAT_ETC2_SRGB8:
+            case PIPE_FORMAT_ETC2_SRGB8A1:
+            case PIPE_FORMAT_ETC2_SRGBA8:
+               num_format = V_008F14_IMG_NUM_FORMAT_SRGB;
+               break;
+            case PIPE_FORMAT_RGTC1_SNORM:
+            case PIPE_FORMAT_LATC1_SNORM:
+            case PIPE_FORMAT_RGTC2_SNORM:
+            case PIPE_FORMAT_LATC2_SNORM:
+            case PIPE_FORMAT_ETC2_R11_SNORM:
+            case PIPE_FORMAT_ETC2_RG11_SNORM:
+            /* implies float, so use SNORM/UNORM to determine
+               whether data is signed or not */
+            case PIPE_FORMAT_BPTC_RGB_FLOAT:
+               num_format = V_008F14_IMG_NUM_FORMAT_SNORM;
+               break;
+            default:
+               num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
+               break;
+            }
+         } else if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
+            num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
+         } else {
+            num_format = V_008F14_IMG_NUM_FORMAT_FLOAT;
+         }
+      } else if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
+         num_format = V_008F14_IMG_NUM_FORMAT_SRGB;
+      } else {
+         num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
+
+         switch (desc->channel[first_non_void].type) {
+         case UTIL_FORMAT_TYPE_FLOAT:
+            num_format = V_008F14_IMG_NUM_FORMAT_FLOAT;
+            break;
+         case UTIL_FORMAT_TYPE_SIGNED:
+            if (desc->channel[first_non_void].normalized)
+               num_format = V_008F14_IMG_NUM_FORMAT_SNORM;
+            else if (desc->channel[first_non_void].pure_integer)
+               num_format = V_008F14_IMG_NUM_FORMAT_SINT;
+            else
+               num_format = V_008F14_IMG_NUM_FORMAT_SSCALED;
+            break;
+         case UTIL_FORMAT_TYPE_UNSIGNED:
+            if (desc->channel[first_non_void].normalized)
+               num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
+            else if (desc->channel[first_non_void].pure_integer)
+               num_format = V_008F14_IMG_NUM_FORMAT_UINT;
+            else
+               num_format = V_008F14_IMG_NUM_FORMAT_USCALED;
+         }
+      }
+   }
+
+   data_format = si_translate_texformat(&screen->b, pipe_format, desc, first_non_void);
+   if (data_format == ~0) {
+      data_format = 0;
+   }
+
+   /* S8 with Z32 HTILE needs a special format. */
+   if (screen->info.chip_class == GFX9 && pipe_format == PIPE_FORMAT_S8_UINT &&
+       tex->tc_compatible_htile)
+      data_format = V_008F14_IMG_DATA_FORMAT_S8_32;
+
+   if (!sampler && (res->target == PIPE_TEXTURE_CUBE || res->target == PIPE_TEXTURE_CUBE_ARRAY ||
+                    (screen->info.chip_class <= GFX8 && res->target == PIPE_TEXTURE_3D))) {
+      /* For the purpose of shader images, treat cube maps and 3D
+       * textures as 2D arrays. For 3D textures, the address
+       * calculations for mipmaps are different, so we rely on the
+       * caller to effectively disable mipmaps.
+       */
+      type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
+
+      assert(res->target != PIPE_TEXTURE_3D || (first_level == 0 && last_level == 0));
+   } else {
+      type = si_tex_dim(screen, tex, target, num_samples);
+   }
+
+   if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) {
+      height = 1;
+      depth = res->array_size;
+   } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) {
+      if (sampler || res->target != PIPE_TEXTURE_3D)
+         depth = res->array_size;
+   } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE)
+      depth = res->array_size / 6;
+
+   state[0] = 0;
+   state[1] = (S_008F14_DATA_FORMAT(data_format) | S_008F14_NUM_FORMAT(num_format));
+   state[2] = (S_008F18_WIDTH(width - 1) | S_008F18_HEIGHT(height - 1) | S_008F18_PERF_MOD(4));
+   state[3] = (S_008F1C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
+               S_008F1C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
+               S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
+               S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
+               S_008F1C_BASE_LEVEL(num_samples > 1 ? 0 : first_level) |
+               S_008F1C_LAST_LEVEL(num_samples > 1 ? util_logbase2(num_samples) : last_level) |
+               S_008F1C_TYPE(type));
+   state[4] = 0;
+   state[5] = S_008F24_BASE_ARRAY(first_layer);
+   state[6] = 0;
+   state[7] = 0;
+
+   if (screen->info.chip_class == GFX9) {
+      unsigned bc_swizzle = gfx9_border_color_swizzle(desc->swizzle);
+
+      /* Depth is the last accessible layer on Gfx9.
+       * The hw doesn't need to know the total number of layers.
+       */
+      if (type == V_008F1C_SQ_RSRC_IMG_3D)
+         state[4] |= S_008F20_DEPTH(depth - 1);
+      else
+         state[4] |= S_008F20_DEPTH(last_layer);
+
+      state[4] |= S_008F20_BC_SWIZZLE(bc_swizzle);
+      state[5] |= S_008F24_MAX_MIP(num_samples > 1 ? util_logbase2(num_samples)
+                                                   : tex->buffer.b.b.last_level);
+   } else {
+      state[3] |= S_008F1C_POW2_PAD(res->last_level > 0);
+      state[4] |= S_008F20_DEPTH(depth - 1);
+      state[5] |= S_008F24_LAST_ARRAY(last_layer);
+   }
+
+   if (tex->surface.dcc_offset) {
+      state[6] = S_008F28_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format));
+   } else {
+      /* The last dword is unused by hw. The shader uses it to clear
+       * bits in the first dword of sampler state.
+       */
+      if (screen->info.chip_class <= GFX7 && res->nr_samples <= 1) {
+         if (first_level == last_level)
+            state[7] = C_008F30_MAX_ANISO_RATIO;
+         else
+            state[7] = 0xffffffff;
+      }
+   }
+
+   /* Initialize the sampler view for FMASK. */
+   if (tex->surface.fmask_offset) {
+      uint32_t data_format, num_format;
+
+      va = tex->buffer.gpu_address + tex->surface.fmask_offset;
+
+#define FMASK(s, f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f)))
+      if (screen->info.chip_class == GFX9) {
+         data_format = V_008F14_IMG_DATA_FORMAT_FMASK;
+         switch (FMASK(res->nr_samples, res->nr_storage_samples)) {
+         case FMASK(2, 1):
+            num_format = V_008F14_IMG_FMASK_8_2_1;
+            break;
+         case FMASK(2, 2):
+            num_format = V_008F14_IMG_FMASK_8_2_2;
+            break;
+         case FMASK(4, 1):
+            num_format = V_008F14_IMG_FMASK_8_4_1;
+            break;
+         case FMASK(4, 2):
+            num_format = V_008F14_IMG_FMASK_8_4_2;
+            break;
+         case FMASK(4, 4):
+            num_format = V_008F14_IMG_FMASK_8_4_4;
+            break;
+         case FMASK(8, 1):
+            num_format = V_008F14_IMG_FMASK_8_8_1;
+            break;
+         case FMASK(8, 2):
+            num_format = V_008F14_IMG_FMASK_16_8_2;
+            break;
+         case FMASK(8, 4):
+            num_format = V_008F14_IMG_FMASK_32_8_4;
+            break;
+         case FMASK(8, 8):
+            num_format = V_008F14_IMG_FMASK_32_8_8;
+            break;
+         case FMASK(16, 1):
+            num_format = V_008F14_IMG_FMASK_16_16_1;
+            break;
+         case FMASK(16, 2):
+            num_format = V_008F14_IMG_FMASK_32_16_2;
+            break;
+         case FMASK(16, 4):
+            num_format = V_008F14_IMG_FMASK_64_16_4;
+            break;
+         case FMASK(16, 8):
+            num_format = V_008F14_IMG_FMASK_64_16_8;
+            break;
+         default:
+            unreachable("invalid nr_samples");
+         }
+      } else {
+         switch (FMASK(res->nr_samples, res->nr_storage_samples)) {
+         case FMASK(2, 1):
+            data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F1;
+            break;
+         case FMASK(2, 2):
+            data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F2;
+            break;
+         case FMASK(4, 1):
+            data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F1;
+            break;
+         case FMASK(4, 2):
+            data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F2;
+            break;
+         case FMASK(4, 4):
+            data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F4;
+            break;
+         case FMASK(8, 1):
+            data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S8_F1;
+            break;
+         case FMASK(8, 2):
+            data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S8_F2;
+            break;
+         case FMASK(8, 4):
+            data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F4;
+            break;
+         case FMASK(8, 8):
+            data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F8;
+            break;
+         case FMASK(16, 1):
+            data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S16_F1;
+            break;
+         case FMASK(16, 2):
+            data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S16_F2;
+            break;
+         case FMASK(16, 4):
+            data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F4;
+            break;
+         case FMASK(16, 8):
+            data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F8;
+            break;
+         default:
+            unreachable("invalid nr_samples");
+         }
+         num_format = V_008F14_IMG_NUM_FORMAT_UINT;
+      }
 #undef FMASK
 
-               fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle;
-               fmask_state[1] = S_008F14_BASE_ADDRESS_HI(va >> 40) |
-                                S_008F14_DATA_FORMAT(data_format) |
-                                S_008F14_NUM_FORMAT(num_format);
-               fmask_state[2] = S_008F18_WIDTH(width - 1) |
-                                S_008F18_HEIGHT(height - 1);
-               fmask_state[3] = S_008F1C_DST_SEL_X(V_008F1C_SQ_SEL_X) |
-                                S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) |
-                                S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) |
-                                S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) |
-                                S_008F1C_TYPE(si_tex_dim(screen, tex, target, 0));
-               fmask_state[4] = 0;
-               fmask_state[5] = S_008F24_BASE_ARRAY(first_layer);
-               fmask_state[6] = 0;
-               fmask_state[7] = 0;
-
-               if (screen->info.chip_class == GFX9) {
-                       fmask_state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode);
-                       fmask_state[4] |= S_008F20_DEPTH(last_layer) |
-                                         S_008F20_PITCH(tex->surface.u.gfx9.fmask.epitch);
-                       fmask_state[5] |= S_008F24_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) |
-                                         S_008F24_META_RB_ALIGNED(tex->surface.u.gfx9.cmask.rb_aligned);
-               } else {
-                       fmask_state[3] |= S_008F1C_TILING_INDEX(tex->surface.u.legacy.fmask.tiling_index);
-                       fmask_state[4] |= S_008F20_DEPTH(depth - 1) |
-                                         S_008F20_PITCH(tex->surface.u.legacy.fmask.pitch_in_pixels - 1);
-                       fmask_state[5] |= S_008F24_LAST_ARRAY(last_layer);
-               }
-       }
+      fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle;
+      fmask_state[1] = S_008F14_BASE_ADDRESS_HI(va >> 40) | S_008F14_DATA_FORMAT(data_format) |
+                       S_008F14_NUM_FORMAT(num_format);
+      fmask_state[2] = S_008F18_WIDTH(width - 1) | S_008F18_HEIGHT(height - 1);
+      fmask_state[3] =
+         S_008F1C_DST_SEL_X(V_008F1C_SQ_SEL_X) | S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) |
+         S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) |
+         S_008F1C_TYPE(si_tex_dim(screen, tex, target, 0));
+      fmask_state[4] = 0;
+      fmask_state[5] = S_008F24_BASE_ARRAY(first_layer);
+      fmask_state[6] = 0;
+      fmask_state[7] = 0;
+
+      if (screen->info.chip_class == GFX9) {
+         fmask_state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode);
+         fmask_state[4] |=
+            S_008F20_DEPTH(last_layer) | S_008F20_PITCH(tex->surface.u.gfx9.fmask.epitch);
+         fmask_state[5] |= S_008F24_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) |
+                           S_008F24_META_RB_ALIGNED(tex->surface.u.gfx9.cmask.rb_aligned);
+      } else {
+         fmask_state[3] |= S_008F1C_TILING_INDEX(tex->surface.u.legacy.fmask.tiling_index);
+         fmask_state[4] |= S_008F20_DEPTH(depth - 1) |
+                           S_008F20_PITCH(tex->surface.u.legacy.fmask.pitch_in_pixels - 1);
+         fmask_state[5] |= S_008F24_LAST_ARRAY(last_layer);
+      }
+   }
 }
 
 /**
@@ -4442,1282 +4199,1195 @@ si_make_texture_descriptor(struct si_screen *screen,
  * @param height0      height0 override (for compressed textures as int)
  * @param force_level   set the base address to the level (for compressed textures)
  */
-struct pipe_sampler_view *
-si_create_sampler_view_custom(struct pipe_context *ctx,
-                             struct pipe_resource *texture,
-                             const struct pipe_sampler_view *state,
-                             unsigned width0, unsigned height0,
-                             unsigned force_level)
+struct pipe_sampler_view *si_create_sampler_view_custom(struct pipe_context *ctx,
+                                                        struct pipe_resource *texture,
+                                                        const struct pipe_sampler_view *state,
+                                                        unsigned width0, unsigned height0,
+                                                        unsigned force_level)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
-       struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view);
-       struct si_texture *tex = (struct si_texture*)texture;
-       unsigned base_level, first_level, last_level;
-       unsigned char state_swizzle[4];
-       unsigned height, depth, width;
-       unsigned last_layer = state->u.tex.last_layer;
-       enum pipe_format pipe_format;
-       const struct legacy_surf_level *surflevel;
-
-       if (!view)
-               return NULL;
-
-       /* initialize base object */
-       view->base = *state;
-       view->base.texture = NULL;
-       view->base.reference.count = 1;
-       view->base.context = ctx;
-
-       assert(texture);
-       pipe_resource_reference(&view->base.texture, texture);
-
-       if (state->format == PIPE_FORMAT_X24S8_UINT ||
-           state->format == PIPE_FORMAT_S8X24_UINT ||
-           state->format == PIPE_FORMAT_X32_S8X24_UINT ||
-           state->format == PIPE_FORMAT_S8_UINT)
-               view->is_stencil_sampler = true;
-
-       /* Buffer resource. */
-       if (texture->target == PIPE_BUFFER) {
-               si_make_buffer_descriptor(sctx->screen,
-                                         si_resource(texture),
-                                         state->format,
-                                         state->u.buf.offset,
-                                         state->u.buf.size,
-                                         view->state);
-               return &view->base;
-       }
-
-       state_swizzle[0] = state->swizzle_r;
-       state_swizzle[1] = state->swizzle_g;
-       state_swizzle[2] = state->swizzle_b;
-       state_swizzle[3] = state->swizzle_a;
-
-       base_level = 0;
-       first_level = state->u.tex.first_level;
-       last_level = state->u.tex.last_level;
-       width = width0;
-       height = height0;
-       depth = texture->depth0;
-
-       if (sctx->chip_class <= GFX8 && force_level) {
-               assert(force_level == first_level &&
-                      force_level == last_level);
-               base_level = force_level;
-               first_level = 0;
-               last_level = 0;
-               width = u_minify(width, force_level);
-               height = u_minify(height, force_level);
-               depth = u_minify(depth, force_level);
-       }
-
-       /* This is not needed if state trackers set last_layer correctly. */
-       if (state->target == PIPE_TEXTURE_1D ||
-           state->target == PIPE_TEXTURE_2D ||
-           state->target == PIPE_TEXTURE_RECT ||
-           state->target == PIPE_TEXTURE_CUBE)
-               last_layer = state->u.tex.first_layer;
-
-       /* Texturing with separate depth and stencil. */
-       pipe_format = state->format;
-
-       /* Depth/stencil texturing sometimes needs a separate texture. */
-       if (tex->is_depth && !si_can_sample_zs(tex, view->is_stencil_sampler)) {
-               if (!tex->flushed_depth_texture &&
-                   !si_init_flushed_depth_texture(ctx, texture)) {
-                       pipe_resource_reference(&view->base.texture, NULL);
-                       FREE(view);
-                       return NULL;
-               }
-
-               assert(tex->flushed_depth_texture);
-
-               /* Override format for the case where the flushed texture
-                * contains only Z or only S.
-                */
-               if (tex->flushed_depth_texture->buffer.b.b.format != tex->buffer.b.b.format)
-                       pipe_format = tex->flushed_depth_texture->buffer.b.b.format;
-
-               tex = tex->flushed_depth_texture;
-       }
-
-       surflevel = tex->surface.u.legacy.level;
-
-       if (tex->db_compatible) {
-               if (!view->is_stencil_sampler)
-                       pipe_format = tex->db_render_format;
-
-               switch (pipe_format) {
-               case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
-                       pipe_format = PIPE_FORMAT_Z32_FLOAT;
-                       break;
-               case PIPE_FORMAT_X8Z24_UNORM:
-               case PIPE_FORMAT_S8_UINT_Z24_UNORM:
-                       /* Z24 is always stored like this for DB
-                        * compatibility.
-                        */
-                       pipe_format = PIPE_FORMAT_Z24X8_UNORM;
-                       break;
-               case PIPE_FORMAT_X24S8_UINT:
-               case PIPE_FORMAT_S8X24_UINT:
-               case PIPE_FORMAT_X32_S8X24_UINT:
-                       pipe_format = PIPE_FORMAT_S8_UINT;
-                       surflevel = tex->surface.u.legacy.stencil_level;
-                       break;
-               default:;
-               }
-       }
-
-       view->dcc_incompatible =
-               vi_dcc_formats_are_incompatible(texture,
-                                               state->u.tex.first_level,
-                                               state->format);
-
-       sctx->screen->make_texture_descriptor(sctx->screen, tex, true,
-                                  state->target, pipe_format, state_swizzle,
-                                  first_level, last_level,
-                                  state->u.tex.first_layer, last_layer,
-                                  width, height, depth,
-                                  view->state, view->fmask_state);
-
-       const struct util_format_description *desc = util_format_description(pipe_format);
-       view->is_integer = false;
-
-       for (unsigned i = 0; i < desc->nr_channels; ++i) {
-               if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID)
-                       continue;
-
-               /* Whether the number format is {U,S}{SCALED,INT} */
-               view->is_integer =
-                       (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED ||
-                        desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) &&
-                       (desc->channel[i].pure_integer || !desc->channel[i].normalized);
-               break;
-       }
-
-       view->base_level_info = &surflevel[base_level];
-       view->base_level = base_level;
-       view->block_width = util_format_get_blockwidth(pipe_format);
-       return &view->base;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view);
+   struct si_texture *tex = (struct si_texture *)texture;
+   unsigned base_level, first_level, last_level;
+   unsigned char state_swizzle[4];
+   unsigned height, depth, width;
+   unsigned last_layer = state->u.tex.last_layer;
+   enum pipe_format pipe_format;
+   const struct legacy_surf_level *surflevel;
+
+   if (!view)
+      return NULL;
+
+   /* initialize base object */
+   view->base = *state;
+   view->base.texture = NULL;
+   view->base.reference.count = 1;
+   view->base.context = ctx;
+
+   assert(texture);
+   pipe_resource_reference(&view->base.texture, texture);
+
+   if (state->format == PIPE_FORMAT_X24S8_UINT || state->format == PIPE_FORMAT_S8X24_UINT ||
+       state->format == PIPE_FORMAT_X32_S8X24_UINT || state->format == PIPE_FORMAT_S8_UINT)
+      view->is_stencil_sampler = true;
+
+   /* Buffer resource. */
+   if (texture->target == PIPE_BUFFER) {
+      si_make_buffer_descriptor(sctx->screen, si_resource(texture), state->format,
+                                state->u.buf.offset, state->u.buf.size, view->state);
+      return &view->base;
+   }
+
+   state_swizzle[0] = state->swizzle_r;
+   state_swizzle[1] = state->swizzle_g;
+   state_swizzle[2] = state->swizzle_b;
+   state_swizzle[3] = state->swizzle_a;
+
+   base_level = 0;
+   first_level = state->u.tex.first_level;
+   last_level = state->u.tex.last_level;
+   width = width0;
+   height = height0;
+   depth = texture->depth0;
+
+   if (sctx->chip_class <= GFX8 && force_level) {
+      assert(force_level == first_level && force_level == last_level);
+      base_level = force_level;
+      first_level = 0;
+      last_level = 0;
+      width = u_minify(width, force_level);
+      height = u_minify(height, force_level);
+      depth = u_minify(depth, force_level);
+   }
+
+   /* This is not needed if state trackers set last_layer correctly. */
+   if (state->target == PIPE_TEXTURE_1D || state->target == PIPE_TEXTURE_2D ||
+       state->target == PIPE_TEXTURE_RECT || state->target == PIPE_TEXTURE_CUBE)
+      last_layer = state->u.tex.first_layer;
+
+   /* Texturing with separate depth and stencil. */
+   pipe_format = state->format;
+
+   /* Depth/stencil texturing sometimes needs a separate texture. */
+   if (tex->is_depth && !si_can_sample_zs(tex, view->is_stencil_sampler)) {
+      if (!tex->flushed_depth_texture && !si_init_flushed_depth_texture(ctx, texture)) {
+         pipe_resource_reference(&view->base.texture, NULL);
+         FREE(view);
+         return NULL;
+      }
+
+      assert(tex->flushed_depth_texture);
+
+      /* Override format for the case where the flushed texture
+       * contains only Z or only S.
+       */
+      if (tex->flushed_depth_texture->buffer.b.b.format != tex->buffer.b.b.format)
+         pipe_format = tex->flushed_depth_texture->buffer.b.b.format;
+
+      tex = tex->flushed_depth_texture;
+   }
+
+   surflevel = tex->surface.u.legacy.level;
+
+   if (tex->db_compatible) {
+      if (!view->is_stencil_sampler)
+         pipe_format = tex->db_render_format;
+
+      switch (pipe_format) {
+      case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+         pipe_format = PIPE_FORMAT_Z32_FLOAT;
+         break;
+      case PIPE_FORMAT_X8Z24_UNORM:
+      case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+         /* Z24 is always stored like this for DB
+          * compatibility.
+          */
+         pipe_format = PIPE_FORMAT_Z24X8_UNORM;
+         break;
+      case PIPE_FORMAT_X24S8_UINT:
+      case PIPE_FORMAT_S8X24_UINT:
+      case PIPE_FORMAT_X32_S8X24_UINT:
+         pipe_format = PIPE_FORMAT_S8_UINT;
+         surflevel = tex->surface.u.legacy.stencil_level;
+         break;
+      default:;
+      }
+   }
+
+   view->dcc_incompatible =
+      vi_dcc_formats_are_incompatible(texture, state->u.tex.first_level, state->format);
+
+   sctx->screen->make_texture_descriptor(
+      sctx->screen, tex, true, state->target, pipe_format, state_swizzle, first_level, last_level,
+      state->u.tex.first_layer, last_layer, width, height, depth, view->state, view->fmask_state);
+
+   const struct util_format_description *desc = util_format_description(pipe_format);
+   view->is_integer = false;
+
+   for (unsigned i = 0; i < desc->nr_channels; ++i) {
+      if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID)
+         continue;
+
+      /* Whether the number format is {U,S}{SCALED,INT} */
+      view->is_integer = (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED ||
+                          desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) &&
+                         (desc->channel[i].pure_integer || !desc->channel[i].normalized);
+      break;
+   }
+
+   view->base_level_info = &surflevel[base_level];
+   view->base_level = base_level;
+   view->block_width = util_format_get_blockwidth(pipe_format);
+   return &view->base;
 }
 
-static struct pipe_sampler_view *
-si_create_sampler_view(struct pipe_context *ctx,
-                      struct pipe_resource *texture,
-                      const struct pipe_sampler_view *state)
+static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx,
+                                                        struct pipe_resource *texture,
+                                                        const struct pipe_sampler_view *state)
 {
-       return si_create_sampler_view_custom(ctx, texture, state,
-                                            texture ? texture->width0 : 0,
-                                            texture ? texture->height0 : 0, 0);
+   return si_create_sampler_view_custom(ctx, texture, state, texture ? texture->width0 : 0,
+                                        texture ? texture->height0 : 0, 0);
 }
 
-static void si_sampler_view_destroy(struct pipe_context *ctx,
-                                   struct pipe_sampler_view *state)
+static void si_sampler_view_destroy(struct pipe_context *ctx, struct pipe_sampler_view *state)
 {
-       struct si_sampler_view *view = (struct si_sampler_view *)state;
+   struct si_sampler_view *view = (struct si_sampler_view *)state;
 
-       pipe_resource_reference(&state->texture, NULL);
-       FREE(view);
+   pipe_resource_reference(&state->texture, NULL);
+   FREE(view);
 }
 
 static bool wrap_mode_uses_border_color(unsigned wrap, bool linear_filter)
 {
-       return wrap == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
-              wrap == PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER ||
-              (linear_filter &&
-               (wrap == PIPE_TEX_WRAP_CLAMP ||
-                wrap == PIPE_TEX_WRAP_MIRROR_CLAMP));
+   return wrap == PIPE_TEX_WRAP_CLAMP_TO_BORDER || wrap == PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER ||
+          (linear_filter && (wrap == PIPE_TEX_WRAP_CLAMP || wrap == PIPE_TEX_WRAP_MIRROR_CLAMP));
 }
 
 static uint32_t si_translate_border_color(struct si_context *sctx,
-                                         const struct pipe_sampler_state *state,
-                                         const union pipe_color_union *color,
-                                         bool is_integer)
+                                          const struct pipe_sampler_state *state,
+                                          const union pipe_color_union *color, bool is_integer)
 {
-       bool linear_filter = state->min_img_filter != PIPE_TEX_FILTER_NEAREST ||
-                            state->mag_img_filter != PIPE_TEX_FILTER_NEAREST;
-
-       if (!wrap_mode_uses_border_color(state->wrap_s, linear_filter) &&
-           !wrap_mode_uses_border_color(state->wrap_t, linear_filter) &&
-           !wrap_mode_uses_border_color(state->wrap_r, linear_filter))
-               return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK);
-
-#define simple_border_types(elt) \
-do { \
-       if (color->elt[0] == 0 && color->elt[1] == 0 &&                         \
-           color->elt[2] == 0 && color->elt[3] == 0)                           \
-               return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); \
-       if (color->elt[0] == 0 && color->elt[1] == 0 &&                         \
-           color->elt[2] == 0 && color->elt[3] == 1)                           \
-               return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_BLACK); \
-       if (color->elt[0] == 1 && color->elt[1] == 1 &&                         \
-           color->elt[2] == 1 && color->elt[3] == 1)                           \
-               return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_WHITE); \
-} while (false)
-
-       if (is_integer)
-               simple_border_types(ui);
-       else
-               simple_border_types(f);
+   bool linear_filter = state->min_img_filter != PIPE_TEX_FILTER_NEAREST ||
+                        state->mag_img_filter != PIPE_TEX_FILTER_NEAREST;
+
+   if (!wrap_mode_uses_border_color(state->wrap_s, linear_filter) &&
+       !wrap_mode_uses_border_color(state->wrap_t, linear_filter) &&
+       !wrap_mode_uses_border_color(state->wrap_r, linear_filter))
+      return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK);
+
+#define simple_border_types(elt)                                                                   \
+   do {                                                                                            \
+      if (color->elt[0] == 0 && color->elt[1] == 0 && color->elt[2] == 0 && color->elt[3] == 0)    \
+         return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK);              \
+      if (color->elt[0] == 0 && color->elt[1] == 0 && color->elt[2] == 0 && color->elt[3] == 1)    \
+         return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_BLACK);             \
+      if (color->elt[0] == 1 && color->elt[1] == 1 && color->elt[2] == 1 && color->elt[3] == 1)    \
+         return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_WHITE);             \
+   } while (false)
+
+   if (is_integer)
+      simple_border_types(ui);
+   else
+      simple_border_types(f);
 
 #undef simple_border_types
 
-       int i;
-
-       /* Check if the border has been uploaded already. */
-       for (i = 0; i < sctx->border_color_count; i++)
-               if (memcmp(&sctx->border_color_table[i], color,
-                          sizeof(*color)) == 0)
-                       break;
-
-       if (i >= SI_MAX_BORDER_COLORS) {
-               /* Getting 4096 unique border colors is very unlikely. */
-               fprintf(stderr, "radeonsi: The border color table is full. "
-                       "Any new border colors will be just black. "
-                       "Please file a bug.\n");
-               return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK);
-       }
-
-       if (i == sctx->border_color_count) {
-               /* Upload a new border color. */
-               memcpy(&sctx->border_color_table[i], color,
-                      sizeof(*color));
-               util_memcpy_cpu_to_le32(&sctx->border_color_map[i],
-                                       color, sizeof(*color));
-               sctx->border_color_count++;
-       }
-
-       return S_008F3C_BORDER_COLOR_PTR(i) |
-              S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_REGISTER);
+   int i;
+
+   /* Check if the border has been uploaded already. */
+   for (i = 0; i < sctx->border_color_count; i++)
+      if (memcmp(&sctx->border_color_table[i], color, sizeof(*color)) == 0)
+         break;
+
+   if (i >= SI_MAX_BORDER_COLORS) {
+      /* Getting 4096 unique border colors is very unlikely. */
+      fprintf(stderr, "radeonsi: The border color table is full. "
+                      "Any new border colors will be just black. "
+                      "Please file a bug.\n");
+      return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK);
+   }
+
+   if (i == sctx->border_color_count) {
+      /* Upload a new border color. */
+      memcpy(&sctx->border_color_table[i], color, sizeof(*color));
+      util_memcpy_cpu_to_le32(&sctx->border_color_map[i], color, sizeof(*color));
+      sctx->border_color_count++;
+   }
+
+   return S_008F3C_BORDER_COLOR_PTR(i) |
+          S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_REGISTER);
 }
 
 static inline int S_FIXED(float value, unsigned frac_bits)
 {
-       return value * (1 << frac_bits);
+   return value * (1 << frac_bits);
 }
 
 static inline unsigned si_tex_filter(unsigned filter, unsigned max_aniso)
 {
-       if (filter == PIPE_TEX_FILTER_LINEAR)
-               return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_BILINEAR
-                                    : V_008F38_SQ_TEX_XY_FILTER_BILINEAR;
-       else
-               return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_POINT
-                                    : V_008F38_SQ_TEX_XY_FILTER_POINT;
+   if (filter == PIPE_TEX_FILTER_LINEAR)
+      return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_BILINEAR
+                           : V_008F38_SQ_TEX_XY_FILTER_BILINEAR;
+   else
+      return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_POINT
+                           : V_008F38_SQ_TEX_XY_FILTER_POINT;
 }
 
 static inline unsigned si_tex_aniso_filter(unsigned filter)
 {
-       if (filter < 2)
-               return 0;
-       if (filter < 4)
-               return 1;
-       if (filter < 8)
-               return 2;
-       if (filter < 16)
-               return 3;
-       return 4;
+   if (filter < 2)
+      return 0;
+   if (filter < 4)
+      return 1;
+   if (filter < 8)
+      return 2;
+   if (filter < 16)
+      return 3;
+   return 4;
 }
 
 static void *si_create_sampler_state(struct pipe_context *ctx,
-                                    const struct pipe_sampler_state *state)
+                                     const struct pipe_sampler_state *state)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_screen *sscreen = sctx->screen;
-       struct si_sampler_state *rstate = CALLOC_STRUCT(si_sampler_state);
-       unsigned max_aniso = sscreen->force_aniso >= 0 ? sscreen->force_aniso
-                                                      : state->max_anisotropy;
-       unsigned max_aniso_ratio = si_tex_aniso_filter(max_aniso);
-       union pipe_color_union clamped_border_color;
-
-       if (!rstate) {
-               return NULL;
-       }
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_screen *sscreen = sctx->screen;
+   struct si_sampler_state *rstate = CALLOC_STRUCT(si_sampler_state);
+   unsigned max_aniso = sscreen->force_aniso >= 0 ? sscreen->force_aniso : state->max_anisotropy;
+   unsigned max_aniso_ratio = si_tex_aniso_filter(max_aniso);
+   union pipe_color_union clamped_border_color;
+
+   if (!rstate) {
+      return NULL;
+   }
 
 #ifndef NDEBUG
-       rstate->magic = SI_SAMPLER_STATE_MAGIC;
+   rstate->magic = SI_SAMPLER_STATE_MAGIC;
 #endif
-       rstate->val[0] = (S_008F30_CLAMP_X(si_tex_wrap(state->wrap_s)) |
-                         S_008F30_CLAMP_Y(si_tex_wrap(state->wrap_t)) |
-                         S_008F30_CLAMP_Z(si_tex_wrap(state->wrap_r)) |
-                         S_008F30_MAX_ANISO_RATIO(max_aniso_ratio) |
-                         S_008F30_DEPTH_COMPARE_FUNC(si_tex_compare(state->compare_func)) |
-                         S_008F30_FORCE_UNNORMALIZED(!state->normalized_coords) |
-                         S_008F30_ANISO_THRESHOLD(max_aniso_ratio >> 1) |
-                         S_008F30_ANISO_BIAS(max_aniso_ratio) |
-                         S_008F30_DISABLE_CUBE_WRAP(!state->seamless_cube_map) |
-                         S_008F30_COMPAT_MODE(sctx->chip_class == GFX8 || sctx->chip_class == GFX9));
-       rstate->val[1] = (S_008F34_MIN_LOD(S_FIXED(CLAMP(state->min_lod, 0, 15), 8)) |
-                         S_008F34_MAX_LOD(S_FIXED(CLAMP(state->max_lod, 0, 15), 8)) |
-                         S_008F34_PERF_MIP(max_aniso_ratio ? max_aniso_ratio + 6 : 0));
-       rstate->val[2] = (S_008F38_LOD_BIAS(S_FIXED(CLAMP(state->lod_bias, -16, 16), 8)) |
-                         S_008F38_XY_MAG_FILTER(si_tex_filter(state->mag_img_filter, max_aniso)) |
-                         S_008F38_XY_MIN_FILTER(si_tex_filter(state->min_img_filter, max_aniso)) |
-                         S_008F38_MIP_FILTER(si_tex_mipfilter(state->min_mip_filter)) |
-                         S_008F38_MIP_POINT_PRECLAMP(0));
-       rstate->val[3] = si_translate_border_color(sctx, state, &state->border_color, false);
-
-       if (sscreen->info.chip_class >= GFX10) {
-               rstate->val[2] |= S_008F38_ANISO_OVERRIDE_GFX10(1);
-       } else {
-               rstate->val[2] |= S_008F38_DISABLE_LSB_CEIL(sctx->chip_class <= GFX8) |
-                                 S_008F38_FILTER_PREC_FIX(1) |
-                                 S_008F38_ANISO_OVERRIDE_GFX6(sctx->chip_class >= GFX8);
-       }
-
-       /* Create sampler resource for integer textures. */
-       memcpy(rstate->integer_val, rstate->val, sizeof(rstate->val));
-       rstate->integer_val[3] = si_translate_border_color(sctx, state, &state->border_color, true);
-
-       /* Create sampler resource for upgraded depth textures. */
-       memcpy(rstate->upgraded_depth_val, rstate->val, sizeof(rstate->val));
-
-       for (unsigned i = 0; i < 4; ++i) {
-               /* Use channel 0 on purpose, so that we can use OPAQUE_WHITE
-                * when the border color is 1.0. */
-               clamped_border_color.f[i] = CLAMP(state->border_color.f[0], 0, 1);
-       }
-
-       if (memcmp(&state->border_color, &clamped_border_color, sizeof(clamped_border_color)) == 0) {
-               if (sscreen->info.chip_class <= GFX9)
-                       rstate->upgraded_depth_val[3] |= S_008F3C_UPGRADED_DEPTH(1);
-       } else {
-               rstate->upgraded_depth_val[3] =
-                       si_translate_border_color(sctx, state, &clamped_border_color, false);
-       }
-
-       return rstate;
+   rstate->val[0] =
+      (S_008F30_CLAMP_X(si_tex_wrap(state->wrap_s)) | S_008F30_CLAMP_Y(si_tex_wrap(state->wrap_t)) |
+       S_008F30_CLAMP_Z(si_tex_wrap(state->wrap_r)) | S_008F30_MAX_ANISO_RATIO(max_aniso_ratio) |
+       S_008F30_DEPTH_COMPARE_FUNC(si_tex_compare(state->compare_func)) |
+       S_008F30_FORCE_UNNORMALIZED(!state->normalized_coords) |
+       S_008F30_ANISO_THRESHOLD(max_aniso_ratio >> 1) | S_008F30_ANISO_BIAS(max_aniso_ratio) |
+       S_008F30_DISABLE_CUBE_WRAP(!state->seamless_cube_map) |
+       S_008F30_COMPAT_MODE(sctx->chip_class == GFX8 || sctx->chip_class == GFX9));
+   rstate->val[1] = (S_008F34_MIN_LOD(S_FIXED(CLAMP(state->min_lod, 0, 15), 8)) |
+                     S_008F34_MAX_LOD(S_FIXED(CLAMP(state->max_lod, 0, 15), 8)) |
+                     S_008F34_PERF_MIP(max_aniso_ratio ? max_aniso_ratio + 6 : 0));
+   rstate->val[2] = (S_008F38_LOD_BIAS(S_FIXED(CLAMP(state->lod_bias, -16, 16), 8)) |
+                     S_008F38_XY_MAG_FILTER(si_tex_filter(state->mag_img_filter, max_aniso)) |
+                     S_008F38_XY_MIN_FILTER(si_tex_filter(state->min_img_filter, max_aniso)) |
+                     S_008F38_MIP_FILTER(si_tex_mipfilter(state->min_mip_filter)) |
+                     S_008F38_MIP_POINT_PRECLAMP(0));
+   rstate->val[3] = si_translate_border_color(sctx, state, &state->border_color, false);
+
+   if (sscreen->info.chip_class >= GFX10) {
+      rstate->val[2] |= S_008F38_ANISO_OVERRIDE_GFX10(1);
+   } else {
+      rstate->val[2] |= S_008F38_DISABLE_LSB_CEIL(sctx->chip_class <= GFX8) |
+                        S_008F38_FILTER_PREC_FIX(1) |
+                        S_008F38_ANISO_OVERRIDE_GFX6(sctx->chip_class >= GFX8);
+   }
+
+   /* Create sampler resource for integer textures. */
+   memcpy(rstate->integer_val, rstate->val, sizeof(rstate->val));
+   rstate->integer_val[3] = si_translate_border_color(sctx, state, &state->border_color, true);
+
+   /* Create sampler resource for upgraded depth textures. */
+   memcpy(rstate->upgraded_depth_val, rstate->val, sizeof(rstate->val));
+
+   for (unsigned i = 0; i < 4; ++i) {
+      /* Use channel 0 on purpose, so that we can use OPAQUE_WHITE
+       * when the border color is 1.0. */
+      clamped_border_color.f[i] = CLAMP(state->border_color.f[0], 0, 1);
+   }
+
+   if (memcmp(&state->border_color, &clamped_border_color, sizeof(clamped_border_color)) == 0) {
+      if (sscreen->info.chip_class <= GFX9)
+         rstate->upgraded_depth_val[3] |= S_008F3C_UPGRADED_DEPTH(1);
+   } else {
+      rstate->upgraded_depth_val[3] =
+         si_translate_border_color(sctx, state, &clamped_border_color, false);
+   }
+
+   return rstate;
 }
 
 static void si_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
+   struct si_context *sctx = (struct si_context *)ctx;
 
-       if (sctx->sample_mask == (uint16_t)sample_mask)
-               return;
+   if (sctx->sample_mask == (uint16_t)sample_mask)
+      return;
 
-       sctx->sample_mask = sample_mask;
-       si_mark_atom_dirty(sctx, &sctx->atoms.s.sample_mask);
+   sctx->sample_mask = sample_mask;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.sample_mask);
 }
 
 static void si_emit_sample_mask(struct si_context *sctx)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       unsigned mask = sctx->sample_mask;
-
-       /* Needed for line and polygon smoothing as well as for the Polaris
-        * small primitive filter. We expect the state tracker to take care of
-        * this for us.
-        */
-       assert(mask == 0xffff || sctx->framebuffer.nr_samples > 1 ||
-              (mask & 1 && sctx->blitter->running));
-
-       radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
-       radeon_emit(cs, mask | (mask << 16));
-       radeon_emit(cs, mask | (mask << 16));
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   unsigned mask = sctx->sample_mask;
+
+   /* Needed for line and polygon smoothing as well as for the Polaris
+    * small primitive filter. We expect the state tracker to take care of
+    * this for us.
+    */
+   assert(mask == 0xffff || sctx->framebuffer.nr_samples > 1 ||
+          (mask & 1 && sctx->blitter->running));
+
+   radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
+   radeon_emit(cs, mask | (mask << 16));
+   radeon_emit(cs, mask | (mask << 16));
 }
 
 static void si_delete_sampler_state(struct pipe_context *ctx, void *state)
 {
 #ifndef NDEBUG
-       struct si_sampler_state *s = state;
+   struct si_sampler_state *s = state;
 
-       assert(s->magic == SI_SAMPLER_STATE_MAGIC);
-       s->magic = 0;
+   assert(s->magic == SI_SAMPLER_STATE_MAGIC);
+   s->magic = 0;
 #endif
-       free(state);
+   free(state);
 }
 
 /*
  * Vertex elements & buffers
  */
 
-struct si_fast_udiv_info32
-si_compute_fast_udiv_info32(uint32_t D, unsigned num_bits)
+struct si_fast_udiv_info32 si_compute_fast_udiv_info32(uint32_t D, unsigned num_bits)
 {
-       struct util_fast_udiv_info info =
-               util_compute_fast_udiv_info(D, num_bits, 32);
-
-       struct si_fast_udiv_info32 result = {
-               info.multiplier,
-               info.pre_shift,
-               info.post_shift,
-               info.increment,
-       };
-       return result;
+   struct util_fast_udiv_info info = util_compute_fast_udiv_info(D, num_bits, 32);
+
+   struct si_fast_udiv_info32 result = {
+      info.multiplier,
+      info.pre_shift,
+      info.post_shift,
+      info.increment,
+   };
+   return result;
 }
 
-static void *si_create_vertex_elements(struct pipe_context *ctx,
-                                      unsigned count,
-                                      const struct pipe_vertex_element *elements)
+static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count,
+                                       const struct pipe_vertex_element *elements)
 {
-       struct si_screen *sscreen = (struct si_screen*)ctx->screen;
-       struct si_vertex_elements *v = CALLOC_STRUCT(si_vertex_elements);
-       bool used[SI_NUM_VERTEX_BUFFERS] = {};
-       struct si_fast_udiv_info32 divisor_factors[SI_MAX_ATTRIBS] = {};
-       STATIC_ASSERT(sizeof(struct si_fast_udiv_info32) == 16);
-       STATIC_ASSERT(sizeof(divisor_factors[0].multiplier) == 4);
-       STATIC_ASSERT(sizeof(divisor_factors[0].pre_shift) == 4);
-       STATIC_ASSERT(sizeof(divisor_factors[0].post_shift) == 4);
-       STATIC_ASSERT(sizeof(divisor_factors[0].increment) == 4);
-       int i;
-
-       assert(count <= SI_MAX_ATTRIBS);
-       if (!v)
-               return NULL;
-
-       v->count = count;
-
-       unsigned alloc_count = count > sscreen->num_vbos_in_user_sgprs ?
-                              count - sscreen->num_vbos_in_user_sgprs : 0;
-       v->vb_desc_list_alloc_size = align(alloc_count * 16, SI_CPDMA_ALIGNMENT);
-
-       for (i = 0; i < count; ++i) {
-               const struct util_format_description *desc;
-               const struct util_format_channel_description *channel;
-               int first_non_void;
-               unsigned vbo_index = elements[i].vertex_buffer_index;
-
-               if (vbo_index >= SI_NUM_VERTEX_BUFFERS) {
-                       FREE(v);
-                       return NULL;
-               }
-
-               unsigned instance_divisor = elements[i].instance_divisor;
-               if (instance_divisor) {
-                       v->uses_instance_divisors = true;
-
-                       if (instance_divisor == 1) {
-                               v->instance_divisor_is_one |= 1u << i;
-                       } else {
-                               v->instance_divisor_is_fetched |= 1u << i;
-                               divisor_factors[i] =
-                                       si_compute_fast_udiv_info32(instance_divisor, 32);
-                       }
-               }
-
-               if (!used[vbo_index]) {
-                       v->first_vb_use_mask |= 1 << i;
-                       used[vbo_index] = true;
-               }
-
-               desc = util_format_description(elements[i].src_format);
-               first_non_void = util_format_get_first_non_void_channel(elements[i].src_format);
-               channel = first_non_void >= 0 ? &desc->channel[first_non_void] : NULL;
-
-               v->format_size[i] = desc->block.bits / 8;
-               v->src_offset[i] = elements[i].src_offset;
-               v->vertex_buffer_index[i] = vbo_index;
-
-               bool always_fix = false;
-               union si_vs_fix_fetch fix_fetch;
-               unsigned log_hw_load_size; /* the load element size as seen by the hardware */
-
-               fix_fetch.bits = 0;
-               log_hw_load_size = MIN2(2, util_logbase2(desc->block.bits) - 3);
-
-               if (channel) {
-                       switch (channel->type) {
-                       case UTIL_FORMAT_TYPE_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break;
-                       case UTIL_FORMAT_TYPE_FIXED: fix_fetch.u.format = AC_FETCH_FORMAT_FIXED; break;
-                       case UTIL_FORMAT_TYPE_SIGNED: {
-                               if (channel->pure_integer)
-                                       fix_fetch.u.format = AC_FETCH_FORMAT_SINT;
-                               else if (channel->normalized)
-                                       fix_fetch.u.format = AC_FETCH_FORMAT_SNORM;
-                               else
-                                       fix_fetch.u.format = AC_FETCH_FORMAT_SSCALED;
-                               break;
-                       }
-                       case UTIL_FORMAT_TYPE_UNSIGNED: {
-                               if (channel->pure_integer)
-                                       fix_fetch.u.format = AC_FETCH_FORMAT_UINT;
-                               else if (channel->normalized)
-                                       fix_fetch.u.format = AC_FETCH_FORMAT_UNORM;
-                               else
-                                       fix_fetch.u.format = AC_FETCH_FORMAT_USCALED;
-                               break;
-                       }
-                       default: unreachable("bad format type");
-                       }
-               } else {
-                       switch (elements[i].src_format) {
-                       case PIPE_FORMAT_R11G11B10_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break;
-                       default: unreachable("bad other format");
-                       }
-               }
-
-               if (desc->channel[0].size == 10) {
-                       fix_fetch.u.log_size = 3; /* special encoding for 2_10_10_10 */
-                       log_hw_load_size = 2;
-
-                       /* The hardware always treats the 2-bit alpha channel as
-                        * unsigned, so a shader workaround is needed. The affected
-                        * chips are GFX8 and older except Stoney (GFX8.1).
-                        */
-                       always_fix = sscreen->info.chip_class <= GFX8 &&
-                                    sscreen->info.family != CHIP_STONEY &&
-                                    channel->type == UTIL_FORMAT_TYPE_SIGNED;
-               } else if (elements[i].src_format == PIPE_FORMAT_R11G11B10_FLOAT) {
-                       fix_fetch.u.log_size = 3; /* special encoding */
-                       fix_fetch.u.format = AC_FETCH_FORMAT_FIXED;
-                       log_hw_load_size = 2;
-               } else {
-                       fix_fetch.u.log_size = util_logbase2(channel->size) - 3;
-                       fix_fetch.u.num_channels_m1 = desc->nr_channels - 1;
-
-                       /* Always fix up:
-                        * - doubles (multiple loads + truncate to float)
-                        * - 32-bit requiring a conversion
-                        */
-                       always_fix =
-                               (fix_fetch.u.log_size == 3) ||
-                               (fix_fetch.u.log_size == 2 &&
-                                fix_fetch.u.format != AC_FETCH_FORMAT_FLOAT &&
-                                fix_fetch.u.format != AC_FETCH_FORMAT_UINT &&
-                                fix_fetch.u.format != AC_FETCH_FORMAT_SINT);
-
-                       /* Also fixup 8_8_8 and 16_16_16. */
-                       if (desc->nr_channels == 3 && fix_fetch.u.log_size <= 1) {
-                               always_fix = true;
-                               log_hw_load_size = fix_fetch.u.log_size;
-                       }
-               }
-
-               if (desc->swizzle[0] != PIPE_SWIZZLE_X) {
-                       assert(desc->swizzle[0] == PIPE_SWIZZLE_Z &&
-                              (desc->swizzle[2] == PIPE_SWIZZLE_X || desc->swizzle[2] == PIPE_SWIZZLE_0));
-                       fix_fetch.u.reverse = 1;
-               }
-
-               /* Force the workaround for unaligned access here already if the
-                * offset relative to the vertex buffer base is unaligned.
-                *
-                * There is a theoretical case in which this is too conservative:
-                * if the vertex buffer's offset is also unaligned in just the
-                * right way, we end up with an aligned address after all.
-                * However, this case should be extremely rare in practice (it
-                * won't happen in well-behaved applications), and taking it
-                * into account would complicate the fast path (where everything
-                * is nicely aligned).
-                */
-               bool check_alignment =
-                       log_hw_load_size >= 1 &&
-                       (sscreen->info.chip_class == GFX6 || sscreen->info.chip_class == GFX10);
-               bool opencode = sscreen->options.vs_fetch_always_opencode;
-
-               if (check_alignment &&
-                   (elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0)
-                       opencode = true;
-
-               if (always_fix || check_alignment || opencode)
-                       v->fix_fetch[i] = fix_fetch.bits;
-
-               if (opencode)
-                       v->fix_fetch_opencode |= 1 << i;
-               if (opencode || always_fix)
-                       v->fix_fetch_always |= 1 << i;
-
-               if (check_alignment && !opencode) {
-                       assert(log_hw_load_size == 1 || log_hw_load_size == 2);
-
-                       v->fix_fetch_unaligned |= 1 << i;
-                       v->hw_load_is_dword |= (log_hw_load_size - 1) << i;
-                       v->vb_alignment_check_mask |= 1 << vbo_index;
-               }
-
-               v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
-                                  S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
-                                  S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
-                                  S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3]));
-
-               if (sscreen->info.chip_class >= GFX10) {
-                       const struct gfx10_format *fmt =
-                               &gfx10_format_table[elements[i].src_format];
-                       assert(fmt->img_format != 0 && fmt->img_format < 128);
-                       v->rsrc_word3[i] |= S_008F0C_FORMAT(fmt->img_format) |
-                                           S_008F0C_RESOURCE_LEVEL(1);
-               } else {
-                       unsigned data_format, num_format;
-                       data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
-                       num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void);
-                       v->rsrc_word3[i] |= S_008F0C_NUM_FORMAT(num_format) |
-                                           S_008F0C_DATA_FORMAT(data_format);
-               }
-       }
-
-       if (v->instance_divisor_is_fetched) {
-               unsigned num_divisors = util_last_bit(v->instance_divisor_is_fetched);
-
-               v->instance_divisor_factor_buffer =
-                       (struct si_resource*)
-                       pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT,
-                                          num_divisors * sizeof(divisor_factors[0]));
-               if (!v->instance_divisor_factor_buffer) {
-                       FREE(v);
-                       return NULL;
-               }
-               void *map = sscreen->ws->buffer_map(v->instance_divisor_factor_buffer->buf,
-                                                   NULL, PIPE_TRANSFER_WRITE);
-               memcpy(map , divisor_factors, num_divisors * sizeof(divisor_factors[0]));
-       }
-       return v;
+   struct si_screen *sscreen = (struct si_screen *)ctx->screen;
+   struct si_vertex_elements *v = CALLOC_STRUCT(si_vertex_elements);
+   bool used[SI_NUM_VERTEX_BUFFERS] = {};
+   struct si_fast_udiv_info32 divisor_factors[SI_MAX_ATTRIBS] = {};
+   STATIC_ASSERT(sizeof(struct si_fast_udiv_info32) == 16);
+   STATIC_ASSERT(sizeof(divisor_factors[0].multiplier) == 4);
+   STATIC_ASSERT(sizeof(divisor_factors[0].pre_shift) == 4);
+   STATIC_ASSERT(sizeof(divisor_factors[0].post_shift) == 4);
+   STATIC_ASSERT(sizeof(divisor_factors[0].increment) == 4);
+   int i;
+
+   assert(count <= SI_MAX_ATTRIBS);
+   if (!v)
+      return NULL;
+
+   v->count = count;
+
+   unsigned alloc_count =
+      count > sscreen->num_vbos_in_user_sgprs ? count - sscreen->num_vbos_in_user_sgprs : 0;
+   v->vb_desc_list_alloc_size = align(alloc_count * 16, SI_CPDMA_ALIGNMENT);
+
+   for (i = 0; i < count; ++i) {
+      const struct util_format_description *desc;
+      const struct util_format_channel_description *channel;
+      int first_non_void;
+      unsigned vbo_index = elements[i].vertex_buffer_index;
+
+      if (vbo_index >= SI_NUM_VERTEX_BUFFERS) {
+         FREE(v);
+         return NULL;
+      }
+
+      unsigned instance_divisor = elements[i].instance_divisor;
+      if (instance_divisor) {
+         v->uses_instance_divisors = true;
+
+         if (instance_divisor == 1) {
+            v->instance_divisor_is_one |= 1u << i;
+         } else {
+            v->instance_divisor_is_fetched |= 1u << i;
+            divisor_factors[i] = si_compute_fast_udiv_info32(instance_divisor, 32);
+         }
+      }
+
+      if (!used[vbo_index]) {
+         v->first_vb_use_mask |= 1 << i;
+         used[vbo_index] = true;
+      }
+
+      desc = util_format_description(elements[i].src_format);
+      first_non_void = util_format_get_first_non_void_channel(elements[i].src_format);
+      channel = first_non_void >= 0 ? &desc->channel[first_non_void] : NULL;
+
+      v->format_size[i] = desc->block.bits / 8;
+      v->src_offset[i] = elements[i].src_offset;
+      v->vertex_buffer_index[i] = vbo_index;
+
+      bool always_fix = false;
+      union si_vs_fix_fetch fix_fetch;
+      unsigned log_hw_load_size; /* the load element size as seen by the hardware */
+
+      fix_fetch.bits = 0;
+      log_hw_load_size = MIN2(2, util_logbase2(desc->block.bits) - 3);
+
+      if (channel) {
+         switch (channel->type) {
+         case UTIL_FORMAT_TYPE_FLOAT:
+            fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT;
+            break;
+         case UTIL_FORMAT_TYPE_FIXED:
+            fix_fetch.u.format = AC_FETCH_FORMAT_FIXED;
+            break;
+         case UTIL_FORMAT_TYPE_SIGNED: {
+            if (channel->pure_integer)
+               fix_fetch.u.format = AC_FETCH_FORMAT_SINT;
+            else if (channel->normalized)
+               fix_fetch.u.format = AC_FETCH_FORMAT_SNORM;
+            else
+               fix_fetch.u.format = AC_FETCH_FORMAT_SSCALED;
+            break;
+         }
+         case UTIL_FORMAT_TYPE_UNSIGNED: {
+            if (channel->pure_integer)
+               fix_fetch.u.format = AC_FETCH_FORMAT_UINT;
+            else if (channel->normalized)
+               fix_fetch.u.format = AC_FETCH_FORMAT_UNORM;
+            else
+               fix_fetch.u.format = AC_FETCH_FORMAT_USCALED;
+            break;
+         }
+         default:
+            unreachable("bad format type");
+         }
+      } else {
+         switch (elements[i].src_format) {
+         case PIPE_FORMAT_R11G11B10_FLOAT:
+            fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT;
+            break;
+         default:
+            unreachable("bad other format");
+         }
+      }
+
+      if (desc->channel[0].size == 10) {
+         fix_fetch.u.log_size = 3; /* special encoding for 2_10_10_10 */
+         log_hw_load_size = 2;
+
+         /* The hardware always treats the 2-bit alpha channel as
+          * unsigned, so a shader workaround is needed. The affected
+          * chips are GFX8 and older except Stoney (GFX8.1).
+          */
+         always_fix = sscreen->info.chip_class <= GFX8 && sscreen->info.family != CHIP_STONEY &&
+                      channel->type == UTIL_FORMAT_TYPE_SIGNED;
+      } else if (elements[i].src_format == PIPE_FORMAT_R11G11B10_FLOAT) {
+         fix_fetch.u.log_size = 3; /* special encoding */
+         fix_fetch.u.format = AC_FETCH_FORMAT_FIXED;
+         log_hw_load_size = 2;
+      } else {
+         fix_fetch.u.log_size = util_logbase2(channel->size) - 3;
+         fix_fetch.u.num_channels_m1 = desc->nr_channels - 1;
+
+         /* Always fix up:
+          * - doubles (multiple loads + truncate to float)
+          * - 32-bit requiring a conversion
+          */
+         always_fix = (fix_fetch.u.log_size == 3) ||
+                      (fix_fetch.u.log_size == 2 && fix_fetch.u.format != AC_FETCH_FORMAT_FLOAT &&
+                       fix_fetch.u.format != AC_FETCH_FORMAT_UINT &&
+                       fix_fetch.u.format != AC_FETCH_FORMAT_SINT);
+
+         /* Also fixup 8_8_8 and 16_16_16. */
+         if (desc->nr_channels == 3 && fix_fetch.u.log_size <= 1) {
+            always_fix = true;
+            log_hw_load_size = fix_fetch.u.log_size;
+         }
+      }
+
+      if (desc->swizzle[0] != PIPE_SWIZZLE_X) {
+         assert(desc->swizzle[0] == PIPE_SWIZZLE_Z &&
+                (desc->swizzle[2] == PIPE_SWIZZLE_X || desc->swizzle[2] == PIPE_SWIZZLE_0));
+         fix_fetch.u.reverse = 1;
+      }
+
+      /* Force the workaround for unaligned access here already if the
+       * offset relative to the vertex buffer base is unaligned.
+       *
+       * There is a theoretical case in which this is too conservative:
+       * if the vertex buffer's offset is also unaligned in just the
+       * right way, we end up with an aligned address after all.
+       * However, this case should be extremely rare in practice (it
+       * won't happen in well-behaved applications), and taking it
+       * into account would complicate the fast path (where everything
+       * is nicely aligned).
+       */
+      bool check_alignment = log_hw_load_size >= 1 && (sscreen->info.chip_class == GFX6 ||
+                                                       sscreen->info.chip_class == GFX10);
+      bool opencode = sscreen->options.vs_fetch_always_opencode;
+
+      if (check_alignment && (elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0)
+         opencode = true;
+
+      if (always_fix || check_alignment || opencode)
+         v->fix_fetch[i] = fix_fetch.bits;
+
+      if (opencode)
+         v->fix_fetch_opencode |= 1 << i;
+      if (opencode || always_fix)
+         v->fix_fetch_always |= 1 << i;
+
+      if (check_alignment && !opencode) {
+         assert(log_hw_load_size == 1 || log_hw_load_size == 2);
+
+         v->fix_fetch_unaligned |= 1 << i;
+         v->hw_load_is_dword |= (log_hw_load_size - 1) << i;
+         v->vb_alignment_check_mask |= 1 << vbo_index;
+      }
+
+      v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
+                         S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
+                         S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
+                         S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3]));
+
+      if (sscreen->info.chip_class >= GFX10) {
+         const struct gfx10_format *fmt = &gfx10_format_table[elements[i].src_format];
+         assert(fmt->img_format != 0 && fmt->img_format < 128);
+         v->rsrc_word3[i] |= S_008F0C_FORMAT(fmt->img_format) | S_008F0C_RESOURCE_LEVEL(1);
+      } else {
+         unsigned data_format, num_format;
+         data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
+         num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void);
+         v->rsrc_word3[i] |= S_008F0C_NUM_FORMAT(num_format) | S_008F0C_DATA_FORMAT(data_format);
+      }
+   }
+
+   if (v->instance_divisor_is_fetched) {
+      unsigned num_divisors = util_last_bit(v->instance_divisor_is_fetched);
+
+      v->instance_divisor_factor_buffer = (struct si_resource *)pipe_buffer_create(
+         &sscreen->b, 0, PIPE_USAGE_DEFAULT, num_divisors * sizeof(divisor_factors[0]));
+      if (!v->instance_divisor_factor_buffer) {
+         FREE(v);
+         return NULL;
+      }
+      void *map =
+         sscreen->ws->buffer_map(v->instance_divisor_factor_buffer->buf, NULL, PIPE_TRANSFER_WRITE);
+      memcpy(map, divisor_factors, num_divisors * sizeof(divisor_factors[0]));
+   }
+   return v;
 }
 
 static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_vertex_elements *old = sctx->vertex_elements;
-       struct si_vertex_elements *v = (struct si_vertex_elements*)state;
-
-       sctx->vertex_elements = v;
-       sctx->num_vertex_elements = v ? v->count : 0;
-
-       if (sctx->num_vertex_elements) {
-               sctx->vertex_buffers_dirty = true;
-       } else {
-               sctx->vertex_buffer_pointer_dirty = false;
-               sctx->vertex_buffer_user_sgprs_dirty = false;
-       }
-
-       if (v &&
-           (!old ||
-            old->count != v->count ||
-            old->uses_instance_divisors != v->uses_instance_divisors ||
-            /* we don't check which divisors changed */
-            v->uses_instance_divisors ||
-            (old->vb_alignment_check_mask ^ v->vb_alignment_check_mask) & sctx->vertex_buffer_unaligned ||
-            ((v->vb_alignment_check_mask & sctx->vertex_buffer_unaligned) &&
-             memcmp(old->vertex_buffer_index, v->vertex_buffer_index,
-                    sizeof(v->vertex_buffer_index[0]) * v->count)) ||
-            /* fix_fetch_{always,opencode,unaligned} and hw_load_is_dword are
-             * functions of fix_fetch and the src_offset alignment.
-             * If they change and fix_fetch doesn't, it must be due to different
-             * src_offset alignment, which is reflected in fix_fetch_opencode. */
-            old->fix_fetch_opencode != v->fix_fetch_opencode ||
-            memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count)))
-               sctx->do_update_shaders = true;
-
-       if (v && v->instance_divisor_is_fetched) {
-               struct pipe_constant_buffer cb;
-
-               cb.buffer = &v->instance_divisor_factor_buffer->b.b;
-               cb.user_buffer = NULL;
-               cb.buffer_offset = 0;
-               cb.buffer_size = 0xffffffff;
-               si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &cb);
-       }
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_vertex_elements *old = sctx->vertex_elements;
+   struct si_vertex_elements *v = (struct si_vertex_elements *)state;
+
+   sctx->vertex_elements = v;
+   sctx->num_vertex_elements = v ? v->count : 0;
+
+   if (sctx->num_vertex_elements) {
+      sctx->vertex_buffers_dirty = true;
+   } else {
+      sctx->vertex_buffer_pointer_dirty = false;
+      sctx->vertex_buffer_user_sgprs_dirty = false;
+   }
+
+   if (v && (!old || old->count != v->count ||
+             old->uses_instance_divisors != v->uses_instance_divisors ||
+             /* we don't check which divisors changed */
+             v->uses_instance_divisors ||
+             (old->vb_alignment_check_mask ^ v->vb_alignment_check_mask) &
+                sctx->vertex_buffer_unaligned ||
+             ((v->vb_alignment_check_mask & sctx->vertex_buffer_unaligned) &&
+              memcmp(old->vertex_buffer_index, v->vertex_buffer_index,
+                     sizeof(v->vertex_buffer_index[0]) * v->count)) ||
+             /* fix_fetch_{always,opencode,unaligned} and hw_load_is_dword are
+              * functions of fix_fetch and the src_offset alignment.
+              * If they change and fix_fetch doesn't, it must be due to different
+              * src_offset alignment, which is reflected in fix_fetch_opencode. */
+             old->fix_fetch_opencode != v->fix_fetch_opencode ||
+             memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count)))
+      sctx->do_update_shaders = true;
+
+   if (v && v->instance_divisor_is_fetched) {
+      struct pipe_constant_buffer cb;
+
+      cb.buffer = &v->instance_divisor_factor_buffer->b.b;
+      cb.user_buffer = NULL;
+      cb.buffer_offset = 0;
+      cb.buffer_size = 0xffffffff;
+      si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &cb);
+   }
 }
 
 static void si_delete_vertex_element(struct pipe_context *ctx, void *state)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_vertex_elements *v = (struct si_vertex_elements*)state;
-
-       if (sctx->vertex_elements == state) {
-               sctx->vertex_elements = NULL;
-               sctx->num_vertex_elements = 0;
-       }
-       si_resource_reference(&v->instance_divisor_factor_buffer, NULL);
-       FREE(state);
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_vertex_elements *v = (struct si_vertex_elements *)state;
+
+   if (sctx->vertex_elements == state) {
+      sctx->vertex_elements = NULL;
+      sctx->num_vertex_elements = 0;
+   }
+   si_resource_reference(&v->instance_divisor_factor_buffer, NULL);
+   FREE(state);
 }
 
-static void si_set_vertex_buffers(struct pipe_context *ctx,
-                                 unsigned start_slot, unsigned count,
-                                 const struct pipe_vertex_buffer *buffers)
+static void si_set_vertex_buffers(struct pipe_context *ctx, unsigned start_slot, unsigned count,
+                                  const struct pipe_vertex_buffer *buffers)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot;
-       unsigned updated_mask = u_bit_consecutive(start_slot, count);
-       uint32_t orig_unaligned = sctx->vertex_buffer_unaligned;
-       uint32_t unaligned = 0;
-       int i;
-
-       assert(start_slot + count <= ARRAY_SIZE(sctx->vertex_buffer));
-
-       if (buffers) {
-               for (i = 0; i < count; i++) {
-                       const struct pipe_vertex_buffer *src = buffers + i;
-                       struct pipe_vertex_buffer *dsti = dst + i;
-                       struct pipe_resource *buf = src->buffer.resource;
-                       unsigned slot_bit = 1 << (start_slot + i);
-
-                       pipe_resource_reference(&dsti->buffer.resource, buf);
-                       dsti->buffer_offset = src->buffer_offset;
-                       dsti->stride = src->stride;
-
-                       if (dsti->buffer_offset & 3 || dsti->stride & 3)
-                               unaligned |= slot_bit;
-
-                       si_context_add_resource_size(sctx, buf);
-                       if (buf)
-                               si_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER;
-               }
-       } else {
-               for (i = 0; i < count; i++) {
-                       pipe_resource_reference(&dst[i].buffer.resource, NULL);
-               }
-               unaligned &= ~updated_mask;
-       }
-       sctx->vertex_buffers_dirty = true;
-       sctx->vertex_buffer_unaligned = (orig_unaligned & ~updated_mask) | unaligned;
-
-       /* Check whether alignment may have changed in a way that requires
-        * shader changes. This check is conservative: a vertex buffer can only
-        * trigger a shader change if the misalignment amount changes (e.g.
-        * from byte-aligned to short-aligned), but we only keep track of
-        * whether buffers are at least dword-aligned, since that should always
-        * be the case in well-behaved applications anyway.
-        */
-       if (sctx->vertex_elements &&
-           (sctx->vertex_elements->vb_alignment_check_mask &
-            (unaligned | orig_unaligned) & updated_mask))
-               sctx->do_update_shaders = true;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot;
+   unsigned updated_mask = u_bit_consecutive(start_slot, count);
+   uint32_t orig_unaligned = sctx->vertex_buffer_unaligned;
+   uint32_t unaligned = 0;
+   int i;
+
+   assert(start_slot + count <= ARRAY_SIZE(sctx->vertex_buffer));
+
+   if (buffers) {
+      for (i = 0; i < count; i++) {
+         const struct pipe_vertex_buffer *src = buffers + i;
+         struct pipe_vertex_buffer *dsti = dst + i;
+         struct pipe_resource *buf = src->buffer.resource;
+         unsigned slot_bit = 1 << (start_slot + i);
+
+         pipe_resource_reference(&dsti->buffer.resource, buf);
+         dsti->buffer_offset = src->buffer_offset;
+         dsti->stride = src->stride;
+
+         if (dsti->buffer_offset & 3 || dsti->stride & 3)
+            unaligned |= slot_bit;
+
+         si_context_add_resource_size(sctx, buf);
+         if (buf)
+            si_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER;
+      }
+   } else {
+      for (i = 0; i < count; i++) {
+         pipe_resource_reference(&dst[i].buffer.resource, NULL);
+      }
+      unaligned &= ~updated_mask;
+   }
+   sctx->vertex_buffers_dirty = true;
+   sctx->vertex_buffer_unaligned = (orig_unaligned & ~updated_mask) | unaligned;
+
+   /* Check whether alignment may have changed in a way that requires
+    * shader changes. This check is conservative: a vertex buffer can only
+    * trigger a shader change if the misalignment amount changes (e.g.
+    * from byte-aligned to short-aligned), but we only keep track of
+    * whether buffers are at least dword-aligned, since that should always
+    * be the case in well-behaved applications anyway.
+    */
+   if (sctx->vertex_elements && (sctx->vertex_elements->vb_alignment_check_mask &
+                                 (unaligned | orig_unaligned) & updated_mask))
+      sctx->do_update_shaders = true;
 }
 
 /*
  * Misc
  */
 
-static void si_set_tess_state(struct pipe_context *ctx,
-                             const float default_outer_level[4],
-                             const float default_inner_level[2])
+static void si_set_tess_state(struct pipe_context *ctx, const float default_outer_level[4],
+                              const float default_inner_level[2])
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct pipe_constant_buffer cb;
-       float array[8];
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct pipe_constant_buffer cb;
+   float array[8];
 
-       memcpy(array, default_outer_level, sizeof(float) * 4);
-       memcpy(array+4, default_inner_level, sizeof(float) * 2);
+   memcpy(array, default_outer_level, sizeof(float) * 4);
+   memcpy(array + 4, default_inner_level, sizeof(float) * 2);
 
-       cb.buffer = NULL;
-       cb.user_buffer = NULL;
-       cb.buffer_size = sizeof(array);
+   cb.buffer = NULL;
+   cb.user_buffer = NULL;
+   cb.buffer_size = sizeof(array);
 
-       si_upload_const_buffer(sctx, (struct si_resource**)&cb.buffer,
-                              (void*)array, sizeof(array),
-                              &cb.buffer_offset);
+   si_upload_const_buffer(sctx, (struct si_resource **)&cb.buffer, (void *)array, sizeof(array),
+                          &cb.buffer_offset);
 
-       si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &cb);
-       pipe_resource_reference(&cb.buffer, NULL);
+   si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &cb);
+   pipe_resource_reference(&cb.buffer, NULL);
 }
 
 static void si_texture_barrier(struct pipe_context *ctx, unsigned flags)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
+   struct si_context *sctx = (struct si_context *)ctx;
 
-       si_update_fb_dirtiness_after_rendering(sctx);
+   si_update_fb_dirtiness_after_rendering(sctx);
 
-       /* Multisample surfaces are flushed in si_decompress_textures. */
-       if (sctx->framebuffer.uncompressed_cb_mask) {
-               si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
-                                          sctx->framebuffer.CB_has_shader_readable_metadata,
-                                          sctx->framebuffer.all_DCC_pipe_aligned);
-       }
+   /* Multisample surfaces are flushed in si_decompress_textures. */
+   if (sctx->framebuffer.uncompressed_cb_mask) {
+      si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
+                                 sctx->framebuffer.CB_has_shader_readable_metadata,
+                                 sctx->framebuffer.all_DCC_pipe_aligned);
+   }
 }
 
 /* This only ensures coherency for shader image/buffer stores. */
 static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-
-       if (!(flags & ~PIPE_BARRIER_UPDATE))
-               return;
-
-       /* Subsequent commands must wait for all shader invocations to
-        * complete. */
-       sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-                      SI_CONTEXT_CS_PARTIAL_FLUSH;
-
-       if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
-               sctx->flags |= SI_CONTEXT_INV_SCACHE |
-                              SI_CONTEXT_INV_VCACHE;
-
-       if (flags & (PIPE_BARRIER_VERTEX_BUFFER |
-                    PIPE_BARRIER_SHADER_BUFFER |
-                    PIPE_BARRIER_TEXTURE |
-                    PIPE_BARRIER_IMAGE |
-                    PIPE_BARRIER_STREAMOUT_BUFFER |
-                    PIPE_BARRIER_GLOBAL_BUFFER)) {
-               /* As far as I can tell, L1 contents are written back to L2
-                * automatically at end of shader, but the contents of other
-                * L1 caches might still be stale. */
-               sctx->flags |= SI_CONTEXT_INV_VCACHE;
-       }
-
-       if (flags & PIPE_BARRIER_INDEX_BUFFER) {
-               /* Indices are read through TC L2 since GFX8.
-                * L1 isn't used.
-                */
-               if (sctx->screen->info.chip_class <= GFX7)
-                       sctx->flags |= SI_CONTEXT_WB_L2;
-       }
-
-       /* MSAA color, any depth and any stencil are flushed in
-        * si_decompress_textures when needed.
-        */
-       if (flags & PIPE_BARRIER_FRAMEBUFFER &&
-           sctx->framebuffer.uncompressed_cb_mask) {
-               sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
-
-               if (sctx->chip_class <= GFX8)
-                       sctx->flags |= SI_CONTEXT_WB_L2;
-       }
-
-       /* Indirect buffers use TC L2 on GFX9, but not older hw. */
-       if (sctx->screen->info.chip_class <= GFX8 &&
-           flags & PIPE_BARRIER_INDIRECT_BUFFER)
-               sctx->flags |= SI_CONTEXT_WB_L2;
+   struct si_context *sctx = (struct si_context *)ctx;
+
+   if (!(flags & ~PIPE_BARRIER_UPDATE))
+      return;
+
+   /* Subsequent commands must wait for all shader invocations to
+    * complete. */
+   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
+
+   if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
+      sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
+
+   if (flags & (PIPE_BARRIER_VERTEX_BUFFER | PIPE_BARRIER_SHADER_BUFFER | PIPE_BARRIER_TEXTURE |
+                PIPE_BARRIER_IMAGE | PIPE_BARRIER_STREAMOUT_BUFFER | PIPE_BARRIER_GLOBAL_BUFFER)) {
+      /* As far as I can tell, L1 contents are written back to L2
+       * automatically at end of shader, but the contents of other
+       * L1 caches might still be stale. */
+      sctx->flags |= SI_CONTEXT_INV_VCACHE;
+   }
+
+   if (flags & PIPE_BARRIER_INDEX_BUFFER) {
+      /* Indices are read through TC L2 since GFX8.
+       * L1 isn't used.
+       */
+      if (sctx->screen->info.chip_class <= GFX7)
+         sctx->flags |= SI_CONTEXT_WB_L2;
+   }
+
+   /* MSAA color, any depth and any stencil are flushed in
+    * si_decompress_textures when needed.
+    */
+   if (flags & PIPE_BARRIER_FRAMEBUFFER && sctx->framebuffer.uncompressed_cb_mask) {
+      sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+
+      if (sctx->chip_class <= GFX8)
+         sctx->flags |= SI_CONTEXT_WB_L2;
+   }
+
+   /* Indirect buffers use TC L2 on GFX9, but not older hw. */
+   if (sctx->screen->info.chip_class <= GFX8 && flags & PIPE_BARRIER_INDIRECT_BUFFER)
+      sctx->flags |= SI_CONTEXT_WB_L2;
 }
 
 static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
 {
-       struct pipe_blend_state blend;
+   struct pipe_blend_state blend;
 
-       memset(&blend, 0, sizeof(blend));
-       blend.independent_blend_enable = true;
-       blend.rt[0].colormask = 0xf;
-       return si_create_blend_state_mode(&sctx->b, &blend, mode);
+   memset(&blend, 0, sizeof(blend));
+   blend.independent_blend_enable = true;
+   blend.rt[0].colormask = 0xf;
+   return si_create_blend_state_mode(&sctx->b, &blend, mode);
 }
 
 static void si_init_config(struct si_context *sctx);
 
 void si_init_state_compute_functions(struct si_context *sctx)
 {
-       sctx->b.create_sampler_state = si_create_sampler_state;
-       sctx->b.delete_sampler_state = si_delete_sampler_state;
-       sctx->b.create_sampler_view = si_create_sampler_view;
-       sctx->b.sampler_view_destroy = si_sampler_view_destroy;
-       sctx->b.memory_barrier = si_memory_barrier;
+   sctx->b.create_sampler_state = si_create_sampler_state;
+   sctx->b.delete_sampler_state = si_delete_sampler_state;
+   sctx->b.create_sampler_view = si_create_sampler_view;
+   sctx->b.sampler_view_destroy = si_sampler_view_destroy;
+   sctx->b.memory_barrier = si_memory_barrier;
 }
 
 void si_init_state_functions(struct si_context *sctx)
 {
-       sctx->atoms.s.framebuffer.emit = si_emit_framebuffer_state;
-       sctx->atoms.s.msaa_sample_locs.emit = si_emit_msaa_sample_locs;
-       sctx->atoms.s.db_render_state.emit = si_emit_db_render_state;
-       sctx->atoms.s.dpbb_state.emit = si_emit_dpbb_state;
-       sctx->atoms.s.msaa_config.emit = si_emit_msaa_config;
-       sctx->atoms.s.sample_mask.emit = si_emit_sample_mask;
-       sctx->atoms.s.cb_render_state.emit = si_emit_cb_render_state;
-       sctx->atoms.s.blend_color.emit = si_emit_blend_color;
-       sctx->atoms.s.clip_regs.emit = si_emit_clip_regs;
-       sctx->atoms.s.clip_state.emit = si_emit_clip_state;
-       sctx->atoms.s.stencil_ref.emit = si_emit_stencil_ref;
-
-       sctx->b.create_blend_state = si_create_blend_state;
-       sctx->b.bind_blend_state = si_bind_blend_state;
-       sctx->b.delete_blend_state = si_delete_blend_state;
-       sctx->b.set_blend_color = si_set_blend_color;
-
-       sctx->b.create_rasterizer_state = si_create_rs_state;
-       sctx->b.bind_rasterizer_state = si_bind_rs_state;
-       sctx->b.delete_rasterizer_state = si_delete_rs_state;
-
-       sctx->b.create_depth_stencil_alpha_state = si_create_dsa_state;
-       sctx->b.bind_depth_stencil_alpha_state = si_bind_dsa_state;
-       sctx->b.delete_depth_stencil_alpha_state = si_delete_dsa_state;
-
-       sctx->custom_dsa_flush = si_create_db_flush_dsa(sctx);
-       sctx->custom_blend_resolve = si_create_blend_custom(sctx, V_028808_CB_RESOLVE);
-       sctx->custom_blend_fmask_decompress = si_create_blend_custom(sctx, V_028808_CB_FMASK_DECOMPRESS);
-       sctx->custom_blend_eliminate_fastclear = si_create_blend_custom(sctx, V_028808_CB_ELIMINATE_FAST_CLEAR);
-       sctx->custom_blend_dcc_decompress = si_create_blend_custom(sctx, V_028808_CB_DCC_DECOMPRESS);
-
-       sctx->b.set_clip_state = si_set_clip_state;
-       sctx->b.set_stencil_ref = si_set_stencil_ref;
-
-       sctx->b.set_framebuffer_state = si_set_framebuffer_state;
-
-       sctx->b.set_sample_mask = si_set_sample_mask;
-
-       sctx->b.create_vertex_elements_state = si_create_vertex_elements;
-       sctx->b.bind_vertex_elements_state = si_bind_vertex_elements;
-       sctx->b.delete_vertex_elements_state = si_delete_vertex_element;
-       sctx->b.set_vertex_buffers = si_set_vertex_buffers;
-
-       sctx->b.texture_barrier = si_texture_barrier;
-       sctx->b.set_min_samples = si_set_min_samples;
-       sctx->b.set_tess_state = si_set_tess_state;
-
-       sctx->b.set_active_query_state = si_set_active_query_state;
-
-       si_init_config(sctx);
+   sctx->atoms.s.framebuffer.emit = si_emit_framebuffer_state;
+   sctx->atoms.s.msaa_sample_locs.emit = si_emit_msaa_sample_locs;
+   sctx->atoms.s.db_render_state.emit = si_emit_db_render_state;
+   sctx->atoms.s.dpbb_state.emit = si_emit_dpbb_state;
+   sctx->atoms.s.msaa_config.emit = si_emit_msaa_config;
+   sctx->atoms.s.sample_mask.emit = si_emit_sample_mask;
+   sctx->atoms.s.cb_render_state.emit = si_emit_cb_render_state;
+   sctx->atoms.s.blend_color.emit = si_emit_blend_color;
+   sctx->atoms.s.clip_regs.emit = si_emit_clip_regs;
+   sctx->atoms.s.clip_state.emit = si_emit_clip_state;
+   sctx->atoms.s.stencil_ref.emit = si_emit_stencil_ref;
+
+   sctx->b.create_blend_state = si_create_blend_state;
+   sctx->b.bind_blend_state = si_bind_blend_state;
+   sctx->b.delete_blend_state = si_delete_blend_state;
+   sctx->b.set_blend_color = si_set_blend_color;
+
+   sctx->b.create_rasterizer_state = si_create_rs_state;
+   sctx->b.bind_rasterizer_state = si_bind_rs_state;
+   sctx->b.delete_rasterizer_state = si_delete_rs_state;
+
+   sctx->b.create_depth_stencil_alpha_state = si_create_dsa_state;
+   sctx->b.bind_depth_stencil_alpha_state = si_bind_dsa_state;
+   sctx->b.delete_depth_stencil_alpha_state = si_delete_dsa_state;
+
+   sctx->custom_dsa_flush = si_create_db_flush_dsa(sctx);
+   sctx->custom_blend_resolve = si_create_blend_custom(sctx, V_028808_CB_RESOLVE);
+   sctx->custom_blend_fmask_decompress = si_create_blend_custom(sctx, V_028808_CB_FMASK_DECOMPRESS);
+   sctx->custom_blend_eliminate_fastclear =
+      si_create_blend_custom(sctx, V_028808_CB_ELIMINATE_FAST_CLEAR);
+   sctx->custom_blend_dcc_decompress = si_create_blend_custom(sctx, V_028808_CB_DCC_DECOMPRESS);
+
+   sctx->b.set_clip_state = si_set_clip_state;
+   sctx->b.set_stencil_ref = si_set_stencil_ref;
+
+   sctx->b.set_framebuffer_state = si_set_framebuffer_state;
+
+   sctx->b.set_sample_mask = si_set_sample_mask;
+
+   sctx->b.create_vertex_elements_state = si_create_vertex_elements;
+   sctx->b.bind_vertex_elements_state = si_bind_vertex_elements;
+   sctx->b.delete_vertex_elements_state = si_delete_vertex_element;
+   sctx->b.set_vertex_buffers = si_set_vertex_buffers;
+
+   sctx->b.texture_barrier = si_texture_barrier;
+   sctx->b.set_min_samples = si_set_min_samples;
+   sctx->b.set_tess_state = si_set_tess_state;
+
+   sctx->b.set_active_query_state = si_set_active_query_state;
+
+   si_init_config(sctx);
 }
 
 void si_init_screen_state_functions(struct si_screen *sscreen)
 {
-       sscreen->b.is_format_supported = si_is_format_supported;
+   sscreen->b.is_format_supported = si_is_format_supported;
 
-       if (sscreen->info.chip_class >= GFX10) {
-               sscreen->make_texture_descriptor = gfx10_make_texture_descriptor;
-       } else {
-               sscreen->make_texture_descriptor = si_make_texture_descriptor;
-       }
+   if (sscreen->info.chip_class >= GFX10) {
+      sscreen->make_texture_descriptor = gfx10_make_texture_descriptor;
+   } else {
+      sscreen->make_texture_descriptor = si_make_texture_descriptor;
+   }
 }
 
-static void si_set_grbm_gfx_index(struct si_context *sctx,
-                                 struct si_pm4_state *pm4,  unsigned value)
+static void si_set_grbm_gfx_index(struct si_context *sctx, struct si_pm4_state *pm4, unsigned value)
 {
-       unsigned reg = sctx->chip_class >= GFX7 ? R_030800_GRBM_GFX_INDEX :
-                                                  R_00802C_GRBM_GFX_INDEX;
-       si_pm4_set_reg(pm4, reg, value);
+   unsigned reg = sctx->chip_class >= GFX7 ? R_030800_GRBM_GFX_INDEX : R_00802C_GRBM_GFX_INDEX;
+   si_pm4_set_reg(pm4, reg, value);
 }
 
-static void si_set_grbm_gfx_index_se(struct si_context *sctx,
-                                    struct si_pm4_state *pm4, unsigned se)
+static void si_set_grbm_gfx_index_se(struct si_context *sctx, struct si_pm4_state *pm4, unsigned se)
 {
-       assert(se == ~0 || se < sctx->screen->info.max_se);
-       si_set_grbm_gfx_index(sctx, pm4,
-                             (se == ~0 ? S_030800_SE_BROADCAST_WRITES(1) :
-                                         S_030800_SE_INDEX(se)) |
-                             S_030800_SH_BROADCAST_WRITES(1) |
-                             S_030800_INSTANCE_BROADCAST_WRITES(1));
+   assert(se == ~0 || se < sctx->screen->info.max_se);
+   si_set_grbm_gfx_index(sctx, pm4,
+                         (se == ~0 ? S_030800_SE_BROADCAST_WRITES(1) : S_030800_SE_INDEX(se)) |
+                            S_030800_SH_BROADCAST_WRITES(1) |
+                            S_030800_INSTANCE_BROADCAST_WRITES(1));
 }
 
-static void
-si_write_harvested_raster_configs(struct si_context *sctx,
-                                 struct si_pm4_state *pm4,
-                                 unsigned raster_config,
-                                 unsigned raster_config_1)
+static void si_write_harvested_raster_configs(struct si_context *sctx, struct si_pm4_state *pm4,
+                                              unsigned raster_config, unsigned raster_config_1)
 {
-       unsigned num_se = MAX2(sctx->screen->info.max_se, 1);
-       unsigned raster_config_se[4];
-       unsigned se;
-
-       ac_get_harvested_configs(&sctx->screen->info,
-                                raster_config,
-                                &raster_config_1,
-                                raster_config_se);
-
-       for (se = 0; se < num_se; se++) {
-               si_set_grbm_gfx_index_se(sctx, pm4, se);
-               si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config_se[se]);
-       }
-       si_set_grbm_gfx_index(sctx, pm4, ~0);
-
-       if (sctx->chip_class >= GFX7) {
-               si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1);
-       }
+   unsigned num_se = MAX2(sctx->screen->info.max_se, 1);
+   unsigned raster_config_se[4];
+   unsigned se;
+
+   ac_get_harvested_configs(&sctx->screen->info, raster_config, &raster_config_1, raster_config_se);
+
+   for (se = 0; se < num_se; se++) {
+      si_set_grbm_gfx_index_se(sctx, pm4, se);
+      si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config_se[se]);
+   }
+   si_set_grbm_gfx_index(sctx, pm4, ~0);
+
+   if (sctx->chip_class >= GFX7) {
+      si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1);
+   }
 }
 
 static void si_set_raster_config(struct si_context *sctx, struct si_pm4_state *pm4)
 {
-       struct si_screen *sscreen = sctx->screen;
-       unsigned num_rb = MIN2(sscreen->info.num_render_backends, 16);
-       unsigned rb_mask = sscreen->info.enabled_rb_mask;
-       unsigned raster_config = sscreen->pa_sc_raster_config;
-       unsigned raster_config_1 = sscreen->pa_sc_raster_config_1;
-
-       if (!rb_mask || util_bitcount(rb_mask) >= num_rb) {
-               /* Always use the default config when all backends are enabled
-                * (or when we failed to determine the enabled backends).
-                */
-               si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG,
-                              raster_config);
-               if (sctx->chip_class >= GFX7)
-                       si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1,
-                                      raster_config_1);
-       } else {
-               si_write_harvested_raster_configs(sctx, pm4, raster_config, raster_config_1);
-       }
+   struct si_screen *sscreen = sctx->screen;
+   unsigned num_rb = MIN2(sscreen->info.num_render_backends, 16);
+   unsigned rb_mask = sscreen->info.enabled_rb_mask;
+   unsigned raster_config = sscreen->pa_sc_raster_config;
+   unsigned raster_config_1 = sscreen->pa_sc_raster_config_1;
+
+   if (!rb_mask || util_bitcount(rb_mask) >= num_rb) {
+      /* Always use the default config when all backends are enabled
+       * (or when we failed to determine the enabled backends).
+       */
+      si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config);
+      if (sctx->chip_class >= GFX7)
+         si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1);
+   } else {
+      si_write_harvested_raster_configs(sctx, pm4, raster_config, raster_config_1);
+   }
 }
 
 static void si_init_config(struct si_context *sctx)
 {
-       struct si_screen *sscreen = sctx->screen;
-       uint64_t border_color_va = sctx->border_color_buffer->gpu_address;
-       bool has_clear_state = sscreen->info.has_clear_state;
-       struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
-
-       if (!pm4)
-               return;
-
-       si_pm4_cmd_begin(pm4, PKT3_CONTEXT_CONTROL);
-       si_pm4_cmd_add(pm4, CONTEXT_CONTROL_LOAD_ENABLE(1));
-       si_pm4_cmd_add(pm4, CONTEXT_CONTROL_SHADOW_ENABLE(1));
-       si_pm4_cmd_end(pm4, false);
-
-       if (has_clear_state) {
-               si_pm4_cmd_begin(pm4, PKT3_CLEAR_STATE);
-               si_pm4_cmd_add(pm4, 0);
-               si_pm4_cmd_end(pm4, false);
-       }
-
-       if (sctx->chip_class <= GFX8)
-               si_set_raster_config(sctx, pm4);
-
-       si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64));
-       if (!has_clear_state)
-               si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0));
-
-       /* FIXME calculate these values somehow ??? */
-       if (sctx->chip_class <= GFX8) {
-               si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES);
-               si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40);
-       }
-
-       if (!has_clear_state) {
-               si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2);
-               si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0);
-               si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
-       }
-
-       if (sscreen->info.chip_class <= GFX9)
-               si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1);
-       if (!has_clear_state)
-               si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0);
-       if (sctx->chip_class < GFX7)
-               si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE, S_008A14_NUM_CLIP_SEQ(3) |
-                              S_008A14_CLIP_VTX_REORDER_ENA(1));
-
-       /* CLEAR_STATE doesn't restore these correctly. */
-       si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1));
-       si_pm4_set_reg(pm4, R_028244_PA_SC_GENERIC_SCISSOR_BR,
-                      S_028244_BR_X(16384) | S_028244_BR_Y(16384));
-
-       /* CLEAR_STATE doesn't clear these correctly on certain generations.
-        * I don't know why. Deduced by trial and error.
-        */
-       if (sctx->chip_class <= GFX7 || !has_clear_state) {
-               si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
-               si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1));
-               si_pm4_set_reg(pm4, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0);
-               si_pm4_set_reg(pm4, R_028034_PA_SC_SCREEN_SCISSOR_BR,
-                              S_028034_BR_X(16384) | S_028034_BR_Y(16384));
-       }
-
-       if (!has_clear_state) {
-               si_pm4_set_reg(pm4, R_028230_PA_SC_EDGERULE,
-                              S_028230_ER_TRI(0xA) |
-                              S_028230_ER_POINT(0xA) |
-                              S_028230_ER_RECT(0xA) |
-                              /* Required by DX10_DIAMOND_TEST_ENA: */
-                              S_028230_ER_LINE_LR(0x1A) |
-                              S_028230_ER_LINE_RL(0x26) |
-                              S_028230_ER_LINE_TB(0xA) |
-                              S_028230_ER_LINE_BT(0xA));
-               si_pm4_set_reg(pm4, R_028820_PA_CL_NANINF_CNTL, 0);
-               si_pm4_set_reg(pm4, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0);
-               si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0);
-               si_pm4_set_reg(pm4, R_028AC8_DB_PRELOAD_CONTROL, 0x0);
-               si_pm4_set_reg(pm4, R_02800C_DB_RENDER_OVERRIDE, 0);
-       }
-
-       if (sctx->chip_class >= GFX10) {
-               si_pm4_set_reg(pm4, R_028A98_VGT_DRAW_PAYLOAD_CNTL, 0);
-               si_pm4_set_reg(pm4, R_030964_GE_MAX_VTX_INDX, ~0);
-               si_pm4_set_reg(pm4, R_030924_GE_MIN_VTX_INDX, 0);
-               si_pm4_set_reg(pm4, R_030928_GE_INDX_OFFSET, 0);
-               si_pm4_set_reg(pm4, R_03097C_GE_STEREO_CNTL, 0);
-               si_pm4_set_reg(pm4, R_030988_GE_USER_VGPR_EN, 0);
-       } else if (sctx->chip_class == GFX9) {
-               si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0);
-               si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0);
-               si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0);
-       } else {
-               /* These registers, when written, also overwrite the CLEAR_STATE
-                * context, so we can't rely on CLEAR_STATE setting them.
-                * It would be an issue if there was another UMD changing them.
-                */
-               si_pm4_set_reg(pm4, R_028400_VGT_MAX_VTX_INDX, ~0);
-               si_pm4_set_reg(pm4, R_028404_VGT_MIN_VTX_INDX, 0);
-               si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0);
-       }
-
-       if (sctx->chip_class >= GFX7) {
-               if (sctx->chip_class >= GFX10) {
-                       /* Logical CUs 16 - 31 */
-                       si_pm4_set_reg(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS,
-                                      S_00B404_CU_EN(0xffff));
-                       si_pm4_set_reg(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS,
-                                      S_00B104_CU_EN(0xffff));
-                       si_pm4_set_reg(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS,
-                                      S_00B004_CU_EN(0xffff));
-               }
-
-               if (sctx->chip_class >= GFX9) {
-                       si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
-                                      S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F));
-               } else {
-                       si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
-                                      S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F));
-                       si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
-                                      S_00B41C_WAVE_LIMIT(0x3F));
-                       si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES,
-                                      S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F));
-
-                       /* If this is 0, Bonaire can hang even if GS isn't being used.
-                        * Other chips are unaffected. These are suboptimal values,
-                        * but we don't use on-chip GS.
-                        */
-                       si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL,
-                                      S_028A44_ES_VERTS_PER_SUBGRP(64) |
-                                      S_028A44_GS_PRIMS_PER_SUBGRP(4));
-               }
-
-               /* Compute LATE_ALLOC_VS.LIMIT. */
-               unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh;
-               unsigned late_alloc_wave64 = 0; /* The limit is per SH. */
-               unsigned cu_mask_vs = 0xffff;
-               unsigned cu_mask_gs = 0xffff;
-
-               if (sctx->chip_class >= GFX10) {
-                       /* For Wave32, the hw will launch twice the number of late
-                        * alloc waves, so 1 == 2x wave32.
-                        */
-                       if (!sscreen->info.use_late_alloc) {
-                               late_alloc_wave64 = 0;
-                       } else if (num_cu_per_sh <= 6) {
-                               late_alloc_wave64 = num_cu_per_sh - 2;
-                       } else {
-                               late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
-
-                               /* CU2 & CU3 disabled because of the dual CU design */
-                               /* Late alloc is not used for NGG on Navi14 due to a hw bug. */
-                               cu_mask_vs = 0xfff3;
-                               cu_mask_gs = sscreen->use_ngg &&
-                                            sctx->family != CHIP_NAVI14 ? 0xfff3 : 0xffff;
-                       }
-               } else {
-                       if (!sscreen->info.use_late_alloc) {
-                               late_alloc_wave64 = 0;
-                       } else if (num_cu_per_sh <= 4) {
-                               /* Too few available compute units per SH. Disallowing
-                                * VS to run on one CU could hurt us more than late VS
-                                * allocation would help.
-                                *
-                                * 2 is the highest safe number that allows us to keep
-                                * all CUs enabled.
-                                */
-                               late_alloc_wave64 = 2;
-                       } else {
-                               /* This is a good initial value, allowing 1 late_alloc
-                                * wave per SIMD on num_cu - 2.
-                                */
-                               late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
-                       }
-
-                       if (late_alloc_wave64 > 2)
-                               cu_mask_vs = 0xfffe; /* 1 CU disabled */
-               }
-
-               /* VS can't execute on one CU if the limit is > 2. */
-               si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
-                       S_00B118_CU_EN(cu_mask_vs) |
-                       S_00B118_WAVE_LIMIT(0x3F));
-               si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS,
-                       S_00B11C_LIMIT(late_alloc_wave64));
-
-               si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
-                              S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F));
-
-               si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
-                              S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F));
-       }
-
-       if (sctx->chip_class >= GFX10) {
-               /* Break up a pixel wave if it contains deallocs for more than
-                * half the parameter cache.
-                *
-                * To avoid a deadlock where pixel waves aren't launched
-                * because they're waiting for more pixels while the frontend
-                * is stuck waiting for PC space, the maximum allowed value is
-                * the size of the PC minus the largest possible allocation for
-                * a single primitive shader subgroup.
-                */
-               si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL,
-                              S_028C50_MAX_DEALLOCS_IN_WAVE(512));
-               si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
-
-               if (!has_clear_state) {
-                       si_pm4_set_reg(pm4, R_02835C_PA_SC_TILE_STEERING_OVERRIDE,
-                                      sscreen->info.pa_sc_tile_steering_override);
-               }
-
-               /* Enable CMASK/FMASK/HTILE/DCC caching in L2 for small chips. */
-               unsigned meta_write_policy, meta_read_policy;
-               /* TODO: investigate whether LRU improves performance on other chips too */
-               if (sscreen->info.num_render_backends <= 4) {
-                       meta_write_policy = V_02807C_CACHE_LRU_WR; /* cache writes */
-                       meta_read_policy =  V_02807C_CACHE_LRU_RD; /* cache reads */
-               } else {
-                       meta_write_policy = V_02807C_CACHE_STREAM_WR; /* write combine */
-                       meta_read_policy =  V_02807C_CACHE_NOA_RD;    /* don't cache reads */
-               }
-
-               si_pm4_set_reg(pm4, R_02807C_DB_RMI_L2_CACHE_CONTROL,
-                              S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
-                              S_02807C_S_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
-                              S_02807C_HTILE_WR_POLICY(meta_write_policy) |
-                              S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
-                              S_02807C_Z_RD_POLICY(V_02807C_CACHE_NOA_RD) |
-                              S_02807C_S_RD_POLICY(V_02807C_CACHE_NOA_RD) |
-                              S_02807C_HTILE_RD_POLICY(meta_read_policy));
-
-               si_pm4_set_reg(pm4, R_028410_CB_RMI_GL2_CACHE_CONTROL,
-                              S_028410_CMASK_WR_POLICY(meta_write_policy) |
-                              S_028410_FMASK_WR_POLICY(meta_write_policy) |
-                              S_028410_DCC_WR_POLICY(meta_write_policy) |
-                              S_028410_COLOR_WR_POLICY(V_028410_CACHE_STREAM_WR) |
-                              S_028410_CMASK_RD_POLICY(meta_read_policy) |
-                              S_028410_FMASK_RD_POLICY(meta_read_policy) |
-                              S_028410_DCC_RD_POLICY(meta_read_policy) |
-                              S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA_RD));
-               si_pm4_set_reg(pm4, R_028428_CB_COVERAGE_OUT_CONTROL, 0);
-
-               si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS,
-                              S_00B0C0_SOFT_GROUPING_EN(1) |
-                              S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1));
-               si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0);
-       }
-
-       if (sctx->chip_class >= GFX9) {
-               si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION,
-                              S_028B50_ACCUM_ISOLINE(40) |
-                              S_028B50_ACCUM_TRI(30) |
-                              S_028B50_ACCUM_QUAD(24) |
-                              S_028B50_DONUT_SPLIT(24) |
-                              S_028B50_TRAP_SPLIT(6));
-       } else if (sctx->chip_class >= GFX8) {
-               unsigned vgt_tess_distribution;
-
-               vgt_tess_distribution =
-                       S_028B50_ACCUM_ISOLINE(32) |
-                       S_028B50_ACCUM_TRI(11) |
-                       S_028B50_ACCUM_QUAD(11) |
-                       S_028B50_DONUT_SPLIT(16);
-
-               /* Testing with Unigine Heaven extreme tessellation yielded best results
-                * with TRAP_SPLIT = 3.
-                */
-               if (sctx->family == CHIP_FIJI ||
-                   sctx->family >= CHIP_POLARIS10)
-                       vgt_tess_distribution |= S_028B50_TRAP_SPLIT(3);
-
-               si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, vgt_tess_distribution);
-       } else if (!has_clear_state) {
-               si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
-               si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16);
-       }
-
-       si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8);
-       if (sctx->chip_class >= GFX7) {
-               si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI,
-                              S_028084_ADDRESS(border_color_va >> 40));
-       }
-       si_pm4_add_bo(pm4, sctx->border_color_buffer, RADEON_USAGE_READ,
-                     RADEON_PRIO_BORDER_COLORS);
-
-       if (sctx->chip_class >= GFX9) {
-               si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1,
-                              S_028C48_MAX_ALLOC_COUNT(sscreen->info.pbb_max_alloc_count - 1) |
-                              S_028C48_MAX_PRIM_PER_BATCH(1023));
-               si_pm4_set_reg(pm4, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
-                              S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1));
-               si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0);
-       }
-
-       si_pm4_upload_indirect_buffer(sctx, pm4);
-       sctx->init_config = pm4;
+   struct si_screen *sscreen = sctx->screen;
+   uint64_t border_color_va = sctx->border_color_buffer->gpu_address;
+   bool has_clear_state = sscreen->info.has_clear_state;
+   struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
+
+   if (!pm4)
+      return;
+
+   si_pm4_cmd_begin(pm4, PKT3_CONTEXT_CONTROL);
+   si_pm4_cmd_add(pm4, CONTEXT_CONTROL_LOAD_ENABLE(1));
+   si_pm4_cmd_add(pm4, CONTEXT_CONTROL_SHADOW_ENABLE(1));
+   si_pm4_cmd_end(pm4, false);
+
+   if (has_clear_state) {
+      si_pm4_cmd_begin(pm4, PKT3_CLEAR_STATE);
+      si_pm4_cmd_add(pm4, 0);
+      si_pm4_cmd_end(pm4, false);
+   }
+
+   if (sctx->chip_class <= GFX8)
+      si_set_raster_config(sctx, pm4);
+
+   si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64));
+   if (!has_clear_state)
+      si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0));
+
+   /* FIXME calculate these values somehow ??? */
+   if (sctx->chip_class <= GFX8) {
+      si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES);
+      si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40);
+   }
+
+   if (!has_clear_state) {
+      si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2);
+      si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0);
+      si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
+   }
+
+   if (sscreen->info.chip_class <= GFX9)
+      si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1);
+   if (!has_clear_state)
+      si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0);
+   if (sctx->chip_class < GFX7)
+      si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE,
+                     S_008A14_NUM_CLIP_SEQ(3) | S_008A14_CLIP_VTX_REORDER_ENA(1));
+
+   /* CLEAR_STATE doesn't restore these correctly. */
+   si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1));
+   si_pm4_set_reg(pm4, R_028244_PA_SC_GENERIC_SCISSOR_BR,
+                  S_028244_BR_X(16384) | S_028244_BR_Y(16384));
+
+   /* CLEAR_STATE doesn't clear these correctly on certain generations.
+    * I don't know why. Deduced by trial and error.
+    */
+   if (sctx->chip_class <= GFX7 || !has_clear_state) {
+      si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
+      si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1));
+      si_pm4_set_reg(pm4, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0);
+      si_pm4_set_reg(pm4, R_028034_PA_SC_SCREEN_SCISSOR_BR,
+                     S_028034_BR_X(16384) | S_028034_BR_Y(16384));
+   }
+
+   if (!has_clear_state) {
+      si_pm4_set_reg(pm4, R_028230_PA_SC_EDGERULE,
+                     S_028230_ER_TRI(0xA) | S_028230_ER_POINT(0xA) | S_028230_ER_RECT(0xA) |
+                        /* Required by DX10_DIAMOND_TEST_ENA: */
+                        S_028230_ER_LINE_LR(0x1A) | S_028230_ER_LINE_RL(0x26) |
+                        S_028230_ER_LINE_TB(0xA) | S_028230_ER_LINE_BT(0xA));
+      si_pm4_set_reg(pm4, R_028820_PA_CL_NANINF_CNTL, 0);
+      si_pm4_set_reg(pm4, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0);
+      si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0);
+      si_pm4_set_reg(pm4, R_028AC8_DB_PRELOAD_CONTROL, 0x0);
+      si_pm4_set_reg(pm4, R_02800C_DB_RENDER_OVERRIDE, 0);
+   }
+
+   if (sctx->chip_class >= GFX10) {
+      si_pm4_set_reg(pm4, R_028A98_VGT_DRAW_PAYLOAD_CNTL, 0);
+      si_pm4_set_reg(pm4, R_030964_GE_MAX_VTX_INDX, ~0);
+      si_pm4_set_reg(pm4, R_030924_GE_MIN_VTX_INDX, 0);
+      si_pm4_set_reg(pm4, R_030928_GE_INDX_OFFSET, 0);
+      si_pm4_set_reg(pm4, R_03097C_GE_STEREO_CNTL, 0);
+      si_pm4_set_reg(pm4, R_030988_GE_USER_VGPR_EN, 0);
+   } else if (sctx->chip_class == GFX9) {
+      si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0);
+      si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0);
+      si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0);
+   } else {
+      /* These registers, when written, also overwrite the CLEAR_STATE
+       * context, so we can't rely on CLEAR_STATE setting them.
+       * It would be an issue if there was another UMD changing them.
+       */
+      si_pm4_set_reg(pm4, R_028400_VGT_MAX_VTX_INDX, ~0);
+      si_pm4_set_reg(pm4, R_028404_VGT_MIN_VTX_INDX, 0);
+      si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0);
+   }
+
+   if (sctx->chip_class >= GFX7) {
+      if (sctx->chip_class >= GFX10) {
+         /* Logical CUs 16 - 31 */
+         si_pm4_set_reg(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff));
+         si_pm4_set_reg(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, S_00B104_CU_EN(0xffff));
+         si_pm4_set_reg(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, S_00B004_CU_EN(0xffff));
+      }
+
+      if (sctx->chip_class >= GFX9) {
+         si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
+                        S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F));
+      } else {
+         si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
+                        S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F));
+         si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_WAVE_LIMIT(0x3F));
+         si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES,
+                        S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F));
+
+         /* If this is 0, Bonaire can hang even if GS isn't being used.
+          * Other chips are unaffected. These are suboptimal values,
+          * but we don't use on-chip GS.
+          */
+         si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL,
+                        S_028A44_ES_VERTS_PER_SUBGRP(64) | S_028A44_GS_PRIMS_PER_SUBGRP(4));
+      }
+
+      /* Compute LATE_ALLOC_VS.LIMIT. */
+      unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh;
+      unsigned late_alloc_wave64 = 0; /* The limit is per SH. */
+      unsigned cu_mask_vs = 0xffff;
+      unsigned cu_mask_gs = 0xffff;
+
+      if (sctx->chip_class >= GFX10) {
+         /* For Wave32, the hw will launch twice the number of late
+          * alloc waves, so 1 == 2x wave32.
+          */
+         if (!sscreen->info.use_late_alloc) {
+            late_alloc_wave64 = 0;
+         } else if (num_cu_per_sh <= 6) {
+            late_alloc_wave64 = num_cu_per_sh - 2;
+         } else {
+            late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
+
+            /* CU2 & CU3 disabled because of the dual CU design */
+            /* Late alloc is not used for NGG on Navi14 due to a hw bug. */
+            cu_mask_vs = 0xfff3;
+            cu_mask_gs = sscreen->use_ngg && sctx->family != CHIP_NAVI14 ? 0xfff3 : 0xffff;
+         }
+      } else {
+         if (!sscreen->info.use_late_alloc) {
+            late_alloc_wave64 = 0;
+         } else if (num_cu_per_sh <= 4) {
+            /* Too few available compute units per SH. Disallowing
+             * VS to run on one CU could hurt us more than late VS
+             * allocation would help.
+             *
+             * 2 is the highest safe number that allows us to keep
+             * all CUs enabled.
+             */
+            late_alloc_wave64 = 2;
+         } else {
+            /* This is a good initial value, allowing 1 late_alloc
+             * wave per SIMD on num_cu - 2.
+             */
+            late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
+         }
+
+         if (late_alloc_wave64 > 2)
+            cu_mask_vs = 0xfffe; /* 1 CU disabled */
+      }
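+      /* Worked example for the computation above, assuming a hypothetical
+       * GFX10 part with use_late_alloc enabled (the numbers are illustrative
+       * only; real values come from sscreen->info):
+       *   num_cu_per_sh = 10 -> late_alloc_wave64 = (10 - 2) * 4 = 32,
+       *                         cu_mask_vs = 0xfff3 (CU2/CU3 masked off)
+       *   num_cu_per_sh = 5  -> late_alloc_wave64 = 5 - 2 = 3,
+       *                         cu_mask_vs = 0xffff (all CUs enabled)
+       */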
+
+      /* VS can't execute on one CU if the limit is > 2. */
+      si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
+                     S_00B118_CU_EN(cu_mask_vs) | S_00B118_WAVE_LIMIT(0x3F));
+      si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
+
+      si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
+                     S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F));
+
+      si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
+                     S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F));
+   }
+
+   if (sctx->chip_class >= GFX10) {
+      /* Break up a pixel wave if it contains deallocs for more than
+       * half the parameter cache.
+       *
+       * To avoid a deadlock where pixel waves aren't launched
+       * because they're waiting for more pixels while the frontend
+       * is stuck waiting for PC space, the maximum allowed value is
+       * the size of the PC minus the largest possible allocation for
+       * a single primitive shader subgroup.
+       */
+      si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL, S_028C50_MAX_DEALLOCS_IN_WAVE(512));
+      si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
+
+      if (!has_clear_state) {
+         si_pm4_set_reg(pm4, R_02835C_PA_SC_TILE_STEERING_OVERRIDE,
+                        sscreen->info.pa_sc_tile_steering_override);
+      }
+
+      /* Enable CMASK/FMASK/HTILE/DCC caching in L2 for small chips. */
+      unsigned meta_write_policy, meta_read_policy;
+      /* TODO: investigate whether LRU improves performance on other chips too */
+      if (sscreen->info.num_render_backends <= 4) {
+         meta_write_policy = V_02807C_CACHE_LRU_WR; /* cache writes */
+         meta_read_policy = V_02807C_CACHE_LRU_RD;  /* cache reads */
+      } else {
+         meta_write_policy = V_02807C_CACHE_STREAM_WR; /* write combine */
+         meta_read_policy = V_02807C_CACHE_NOA_RD;     /* don't cache reads */
+      }
+
+      si_pm4_set_reg(pm4, R_02807C_DB_RMI_L2_CACHE_CONTROL,
+                     S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
+                        S_02807C_S_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
+                        S_02807C_HTILE_WR_POLICY(meta_write_policy) |
+                        S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
+                        S_02807C_Z_RD_POLICY(V_02807C_CACHE_NOA_RD) |
+                        S_02807C_S_RD_POLICY(V_02807C_CACHE_NOA_RD) |
+                        S_02807C_HTILE_RD_POLICY(meta_read_policy));
+
+      si_pm4_set_reg(
+         pm4, R_028410_CB_RMI_GL2_CACHE_CONTROL,
+         S_028410_CMASK_WR_POLICY(meta_write_policy) | S_028410_FMASK_WR_POLICY(meta_write_policy) |
+            S_028410_DCC_WR_POLICY(meta_write_policy) |
+            S_028410_COLOR_WR_POLICY(V_028410_CACHE_STREAM_WR) |
+            S_028410_CMASK_RD_POLICY(meta_read_policy) |
+            S_028410_FMASK_RD_POLICY(meta_read_policy) | S_028410_DCC_RD_POLICY(meta_read_policy) |
+            S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA_RD));
+      si_pm4_set_reg(pm4, R_028428_CB_COVERAGE_OUT_CONTROL, 0);
+
+      si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS,
+                     S_00B0C0_SOFT_GROUPING_EN(1) | S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1));
+      si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0);
+   }
+
+   if (sctx->chip_class >= GFX9) {
+      si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION,
+                     S_028B50_ACCUM_ISOLINE(40) | S_028B50_ACCUM_TRI(30) | S_028B50_ACCUM_QUAD(24) |
+                        S_028B50_DONUT_SPLIT(24) | S_028B50_TRAP_SPLIT(6));
+   } else if (sctx->chip_class >= GFX8) {
+      unsigned vgt_tess_distribution;
+
+      vgt_tess_distribution = S_028B50_ACCUM_ISOLINE(32) | S_028B50_ACCUM_TRI(11) |
+                              S_028B50_ACCUM_QUAD(11) | S_028B50_DONUT_SPLIT(16);
+
+      /* Testing with Unigine Heaven extreme tessellation yielded best results
+       * with TRAP_SPLIT = 3.
+       */
+      if (sctx->family == CHIP_FIJI || sctx->family >= CHIP_POLARIS10)
+         vgt_tess_distribution |= S_028B50_TRAP_SPLIT(3);
+
+      si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, vgt_tess_distribution);
+   } else if (!has_clear_state) {
+      si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
+      si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16);
+   }
+
+   si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8);
+   if (sctx->chip_class >= GFX7) {
+      si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, S_028084_ADDRESS(border_color_va >> 40));
+   }
+   si_pm4_add_bo(pm4, sctx->border_color_buffer, RADEON_USAGE_READ, RADEON_PRIO_BORDER_COLORS);
+
+   if (sctx->chip_class >= GFX9) {
+      si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1,
+                     S_028C48_MAX_ALLOC_COUNT(sscreen->info.pbb_max_alloc_count - 1) |
+                        S_028C48_MAX_PRIM_PER_BATCH(1023));
+      si_pm4_set_reg(pm4, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
+                     S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1));
+      si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0);
+   }
+
+   si_pm4_upload_indirect_buffer(sctx, pm4);
+   sctx->init_config = pm4;
 }
index 824bf4fef41bab7fe09a7dc06437ef0c325de58a..aa024b72e4311d1c492e333115c034ddb16baaf2 100644 (file)
 #ifndef SI_STATE_H
 #define SI_STATE_H
 
-#include "si_pm4.h"
-
 #include "pipebuffer/pb_slab.h"
+#include "si_pm4.h"
 #include "util/u_blitter.h"
 
-#define SI_NUM_GRAPHICS_SHADERS (PIPE_SHADER_TESS_EVAL+1)
-#define SI_NUM_SHADERS (PIPE_SHADER_COMPUTE+1)
+#define SI_NUM_GRAPHICS_SHADERS (PIPE_SHADER_TESS_EVAL + 1)
+#define SI_NUM_SHADERS          (PIPE_SHADER_COMPUTE + 1)
 
-#define SI_NUM_VERTEX_BUFFERS          SI_MAX_ATTRIBS
-#define SI_NUM_SAMPLERS                        32 /* OpenGL texture units per shader */
-#define SI_NUM_CONST_BUFFERS           16
-#define SI_NUM_IMAGES                  16
-#define SI_NUM_IMAGE_SLOTS             (SI_NUM_IMAGES * 2) /* the second half are FMASK slots */
-#define SI_NUM_SHADER_BUFFERS          16
+#define SI_NUM_VERTEX_BUFFERS SI_MAX_ATTRIBS
+#define SI_NUM_SAMPLERS       32 /* OpenGL texture units per shader */
+#define SI_NUM_CONST_BUFFERS  16
+#define SI_NUM_IMAGES         16
+#define SI_NUM_IMAGE_SLOTS    (SI_NUM_IMAGES * 2) /* the second half are FMASK slots */
+#define SI_NUM_SHADER_BUFFERS 16
 
 struct si_screen;
 struct si_shader;
@@ -48,351 +47,335 @@ struct si_texture;
 struct si_qbo_state;
 
 struct si_state_blend {
-       struct si_pm4_state     pm4;
-       uint32_t                cb_target_mask;
-       /* Set 0xf or 0x0 (4 bits) per render target if the following is
-        * true. ANDed with spi_shader_col_format.
-        */
-       unsigned                cb_target_enabled_4bit;
-       unsigned                blend_enable_4bit;
-       unsigned                need_src_alpha_4bit;
-       unsigned                commutative_4bit;
-       unsigned                dcc_msaa_corruption_4bit;
-       bool                    alpha_to_coverage:1;
-       bool                    alpha_to_one:1;
-       bool                    dual_src_blend:1;
-       bool                    logicop_enable:1;
+   struct si_pm4_state pm4;
+   uint32_t cb_target_mask;
+   /* Set 0xf or 0x0 (4 bits) per render target if the following is
+    * true. ANDed with spi_shader_col_format.
+    */
+   unsigned cb_target_enabled_4bit;
+   unsigned blend_enable_4bit;
+   unsigned need_src_alpha_4bit;
+   unsigned commutative_4bit;
+   unsigned dcc_msaa_corruption_4bit;
+   bool alpha_to_coverage : 1;
+   bool alpha_to_one : 1;
+   bool dual_src_blend : 1;
+   bool logicop_enable : 1;
 };
 
 struct si_state_rasterizer {
-       struct si_pm4_state     pm4;
-       /* poly offset states for 16-bit, 24-bit, and 32-bit zbuffers */
-       struct si_pm4_state     *pm4_poly_offset;
-       unsigned                pa_sc_line_stipple;
-       unsigned                pa_cl_clip_cntl;
-       float                   line_width;
-       float                   max_point_size;
-       unsigned                sprite_coord_enable:8;
-       unsigned                clip_plane_enable:8;
-       unsigned                half_pixel_center:1;
-       unsigned                flatshade:1;
-       unsigned                flatshade_first:1;
-       unsigned                two_side:1;
-       unsigned                multisample_enable:1;
-       unsigned                force_persample_interp:1;
-       unsigned                line_stipple_enable:1;
-       unsigned                poly_stipple_enable:1;
-       unsigned                line_smooth:1;
-       unsigned                poly_smooth:1;
-       unsigned                uses_poly_offset:1;
-       unsigned                clamp_fragment_color:1;
-       unsigned                clamp_vertex_color:1;
-       unsigned                rasterizer_discard:1;
-       unsigned                scissor_enable:1;
-       unsigned                clip_halfz:1;
-       unsigned                cull_front:1;
-       unsigned                cull_back:1;
-       unsigned                depth_clamp_any:1;
-       unsigned                provoking_vertex_first:1;
-       unsigned                polygon_mode_enabled:1;
-       unsigned                polygon_mode_is_lines:1;
+   struct si_pm4_state pm4;
+   /* poly offset states for 16-bit, 24-bit, and 32-bit zbuffers */
+   struct si_pm4_state *pm4_poly_offset;
+   unsigned pa_sc_line_stipple;
+   unsigned pa_cl_clip_cntl;
+   float line_width;
+   float max_point_size;
+   unsigned sprite_coord_enable : 8;
+   unsigned clip_plane_enable : 8;
+   unsigned half_pixel_center : 1;
+   unsigned flatshade : 1;
+   unsigned flatshade_first : 1;
+   unsigned two_side : 1;
+   unsigned multisample_enable : 1;
+   unsigned force_persample_interp : 1;
+   unsigned line_stipple_enable : 1;
+   unsigned poly_stipple_enable : 1;
+   unsigned line_smooth : 1;
+   unsigned poly_smooth : 1;
+   unsigned uses_poly_offset : 1;
+   unsigned clamp_fragment_color : 1;
+   unsigned clamp_vertex_color : 1;
+   unsigned rasterizer_discard : 1;
+   unsigned scissor_enable : 1;
+   unsigned clip_halfz : 1;
+   unsigned cull_front : 1;
+   unsigned cull_back : 1;
+   unsigned depth_clamp_any : 1;
+   unsigned provoking_vertex_first : 1;
+   unsigned polygon_mode_enabled : 1;
+   unsigned polygon_mode_is_lines : 1;
 };
 
 struct si_dsa_stencil_ref_part {
-       uint8_t                 valuemask[2];
-       uint8_t                 writemask[2];
+   uint8_t valuemask[2];
+   uint8_t writemask[2];
 };
 
 struct si_dsa_order_invariance {
-       /** Whether the final result in Z/S buffers is guaranteed to be
-        * invariant under changes to the order in which fragments arrive. */
-       bool zs:1;
-
-       /** Whether the set of fragments that pass the combined Z/S test is
-        * guaranteed to be invariant under changes to the order in which
-        * fragments arrive. */
-       bool pass_set:1;
-
-       /** Whether the last fragment that passes the combined Z/S test at each
-        * sample is guaranteed to be invariant under changes to the order in
-        * which fragments arrive. */
-       bool pass_last:1;
+   /** Whether the final result in Z/S buffers is guaranteed to be
+    * invariant under changes to the order in which fragments arrive. */
+   bool zs : 1;
+
+   /** Whether the set of fragments that pass the combined Z/S test is
+    * guaranteed to be invariant under changes to the order in which
+    * fragments arrive. */
+   bool pass_set : 1;
+
+   /** Whether the last fragment that passes the combined Z/S test at each
+    * sample is guaranteed to be invariant under changes to the order in
+    * which fragments arrive. */
+   bool pass_last : 1;
 };
 
 struct si_state_dsa {
-       struct si_pm4_state             pm4;
-       struct si_dsa_stencil_ref_part  stencil_ref;
-
-       /* 0 = without stencil buffer, 1 = when both Z and S buffers are present */
-       struct si_dsa_order_invariance  order_invariance[2];
-
-       ubyte                           alpha_func:3;
-       bool                            depth_enabled:1;
-       bool                            depth_write_enabled:1;
-       bool                            stencil_enabled:1;
-       bool                            stencil_write_enabled:1;
-       bool                            db_can_write:1;
-
+   struct si_pm4_state pm4;
+   struct si_dsa_stencil_ref_part stencil_ref;
+
+   /* 0 = without stencil buffer, 1 = when both Z and S buffers are present */
+   struct si_dsa_order_invariance order_invariance[2];
+
+   ubyte alpha_func : 3;
+   bool depth_enabled : 1;
+   bool depth_write_enabled : 1;
+   bool stencil_enabled : 1;
+   bool stencil_write_enabled : 1;
+   bool db_can_write : 1;
 };
 
 struct si_stencil_ref {
-       struct pipe_stencil_ref         state;
-       struct si_dsa_stencil_ref_part  dsa_part;
+   struct pipe_stencil_ref state;
+   struct si_dsa_stencil_ref_part dsa_part;
 };
 
-struct si_vertex_elements
-{
-       struct si_resource              *instance_divisor_factor_buffer;
-       uint32_t                        rsrc_word3[SI_MAX_ATTRIBS];
-       uint16_t                        src_offset[SI_MAX_ATTRIBS];
-       uint8_t                         fix_fetch[SI_MAX_ATTRIBS];
-       uint8_t                         format_size[SI_MAX_ATTRIBS];
-       uint8_t                         vertex_buffer_index[SI_MAX_ATTRIBS];
-
-       /* Bitmask of elements that always need a fixup to be applied. */
-       uint16_t                        fix_fetch_always;
-
-       /* Bitmask of elements whose fetch should always be opencoded. */
-       uint16_t                        fix_fetch_opencode;
-
-       /* Bitmask of elements which need to be opencoded if the vertex buffer
-        * is unaligned. */
-       uint16_t                        fix_fetch_unaligned;
-
-       /* For elements in fix_fetch_unaligned: whether the effective
-        * element load size as seen by the hardware is a dword (as opposed
-        * to a short).
-        */
-       uint16_t                        hw_load_is_dword;
-
-       /* Bitmask of vertex buffers requiring alignment check */
-       uint16_t                        vb_alignment_check_mask;
-
-       uint8_t                         count;
-       bool                            uses_instance_divisors;
-
-       uint16_t                        first_vb_use_mask;
-       /* Vertex buffer descriptor list size aligned for optimal prefetch. */
-       uint16_t                        vb_desc_list_alloc_size;
-       uint16_t                        instance_divisor_is_one; /* bitmask of inputs */
-       uint16_t                        instance_divisor_is_fetched;  /* bitmask of inputs */
+struct si_vertex_elements {
+   struct si_resource *instance_divisor_factor_buffer;
+   uint32_t rsrc_word3[SI_MAX_ATTRIBS];
+   uint16_t src_offset[SI_MAX_ATTRIBS];
+   uint8_t fix_fetch[SI_MAX_ATTRIBS];
+   uint8_t format_size[SI_MAX_ATTRIBS];
+   uint8_t vertex_buffer_index[SI_MAX_ATTRIBS];
+
+   /* Bitmask of elements that always need a fixup to be applied. */
+   uint16_t fix_fetch_always;
+
+   /* Bitmask of elements whose fetch should always be opencoded. */
+   uint16_t fix_fetch_opencode;
+
+   /* Bitmask of elements which need to be opencoded if the vertex buffer
+    * is unaligned. */
+   uint16_t fix_fetch_unaligned;
+
+   /* For elements in fix_fetch_unaligned: whether the effective
+    * element load size as seen by the hardware is a dword (as opposed
+    * to a short).
+    */
+   uint16_t hw_load_is_dword;
+
+   /* Bitmask of vertex buffers requiring alignment check */
+   uint16_t vb_alignment_check_mask;
+
+   uint8_t count;
+   bool uses_instance_divisors;
+
+   uint16_t first_vb_use_mask;
+   /* Vertex buffer descriptor list size aligned for optimal prefetch. */
+   uint16_t vb_desc_list_alloc_size;
+   uint16_t instance_divisor_is_one;     /* bitmask of inputs */
+   uint16_t instance_divisor_is_fetched; /* bitmask of inputs */
 };
 
 union si_state {
-       struct {
-               struct si_state_blend           *blend;
-               struct si_state_rasterizer      *rasterizer;
-               struct si_state_dsa             *dsa;
-               struct si_pm4_state             *poly_offset;
-               struct si_pm4_state             *ls;
-               struct si_pm4_state             *hs;
-               struct si_pm4_state             *es;
-               struct si_pm4_state             *gs;
-               struct si_pm4_state             *vgt_shader_config;
-               struct si_pm4_state             *vs;
-               struct si_pm4_state             *ps;
-       } named;
-       struct si_pm4_state     *array[0];
+   struct {
+      struct si_state_blend *blend;
+      struct si_state_rasterizer *rasterizer;
+      struct si_state_dsa *dsa;
+      struct si_pm4_state *poly_offset;
+      struct si_pm4_state *ls;
+      struct si_pm4_state *hs;
+      struct si_pm4_state *es;
+      struct si_pm4_state *gs;
+      struct si_pm4_state *vgt_shader_config;
+      struct si_pm4_state *vs;
+      struct si_pm4_state *ps;
+   } named;
+   struct si_pm4_state *array[0];
 };
 
-#define SI_STATE_IDX(name) \
-       (offsetof(union si_state, named.name) / sizeof(struct si_pm4_state *))
+#define SI_STATE_IDX(name) (offsetof(union si_state, named.name) / sizeof(struct si_pm4_state *))
 #define SI_STATE_BIT(name) (1 << SI_STATE_IDX(name))
-#define SI_NUM_STATES (sizeof(union si_state) / sizeof(struct si_pm4_state *))
+#define SI_NUM_STATES      (sizeof(union si_state) / sizeof(struct si_pm4_state *))
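+/* Worked example, given the member order of union si_state above:
+ * SI_STATE_IDX(blend) == 0 and SI_STATE_IDX(rasterizer) == 1, so
+ * SI_STATE_BIT(rasterizer) == 0x2, and SI_NUM_STATES equals the number of
+ * pm4 pointers in the union (11 with the members listed above).
+ */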
 
 static inline unsigned si_states_that_always_roll_context(void)
 {
-       return (SI_STATE_BIT(blend) |
-               SI_STATE_BIT(rasterizer) |
-               SI_STATE_BIT(dsa) |
-               SI_STATE_BIT(poly_offset) |
-               SI_STATE_BIT(vgt_shader_config));
+   return (SI_STATE_BIT(blend) | SI_STATE_BIT(rasterizer) | SI_STATE_BIT(dsa) |
+           SI_STATE_BIT(poly_offset) | SI_STATE_BIT(vgt_shader_config));
 }
 
 union si_state_atoms {
-       struct {
-               /* The order matters. */
-               struct si_atom render_cond;
-               struct si_atom streamout_begin;
-               struct si_atom streamout_enable; /* must be after streamout_begin */
-               struct si_atom framebuffer;
-               struct si_atom msaa_sample_locs;
-               struct si_atom db_render_state;
-               struct si_atom dpbb_state;
-               struct si_atom msaa_config;
-               struct si_atom sample_mask;
-               struct si_atom cb_render_state;
-               struct si_atom blend_color;
-               struct si_atom clip_regs;
-               struct si_atom clip_state;
-               struct si_atom shader_pointers;
-               struct si_atom guardband;
-               struct si_atom scissors;
-               struct si_atom viewports;
-               struct si_atom stencil_ref;
-               struct si_atom spi_map;
-               struct si_atom scratch_state;
-               struct si_atom window_rectangles;
-               struct si_atom shader_query;
-       } s;
-       struct si_atom array[0];
+   struct {
+      /* The order matters. */
+      struct si_atom render_cond;
+      struct si_atom streamout_begin;
+      struct si_atom streamout_enable; /* must be after streamout_begin */
+      struct si_atom framebuffer;
+      struct si_atom msaa_sample_locs;
+      struct si_atom db_render_state;
+      struct si_atom dpbb_state;
+      struct si_atom msaa_config;
+      struct si_atom sample_mask;
+      struct si_atom cb_render_state;
+      struct si_atom blend_color;
+      struct si_atom clip_regs;
+      struct si_atom clip_state;
+      struct si_atom shader_pointers;
+      struct si_atom guardband;
+      struct si_atom scissors;
+      struct si_atom viewports;
+      struct si_atom stencil_ref;
+      struct si_atom spi_map;
+      struct si_atom scratch_state;
+      struct si_atom window_rectangles;
+      struct si_atom shader_query;
+   } s;
+   struct si_atom array[0];
 };
 
-#define SI_ATOM_BIT(name) (1 << (offsetof(union si_state_atoms, s.name) / \
-                                sizeof(struct si_atom)))
-#define SI_NUM_ATOMS (sizeof(union si_state_atoms)/sizeof(struct si_atom*))
+#define SI_ATOM_BIT(name) (1 << (offsetof(union si_state_atoms, s.name) / sizeof(struct si_atom)))
+#define SI_NUM_ATOMS      (sizeof(union si_state_atoms) / sizeof(struct si_atom *))
 
 static inline unsigned si_atoms_that_always_roll_context(void)
 {
-       return (SI_ATOM_BIT(streamout_begin) |
-               SI_ATOM_BIT(streamout_enable) |
-               SI_ATOM_BIT(framebuffer) |
-               SI_ATOM_BIT(msaa_sample_locs) |
-               SI_ATOM_BIT(sample_mask) |
-               SI_ATOM_BIT(blend_color) |
-               SI_ATOM_BIT(clip_state) |
-               SI_ATOM_BIT(scissors) |
-               SI_ATOM_BIT(viewports) |
-               SI_ATOM_BIT(stencil_ref) |
-               SI_ATOM_BIT(scratch_state) |
-               SI_ATOM_BIT(window_rectangles));
+   return (SI_ATOM_BIT(streamout_begin) | SI_ATOM_BIT(streamout_enable) | SI_ATOM_BIT(framebuffer) |
+           SI_ATOM_BIT(msaa_sample_locs) | SI_ATOM_BIT(sample_mask) | SI_ATOM_BIT(blend_color) |
+           SI_ATOM_BIT(clip_state) | SI_ATOM_BIT(scissors) | SI_ATOM_BIT(viewports) |
+           SI_ATOM_BIT(stencil_ref) | SI_ATOM_BIT(scratch_state) | SI_ATOM_BIT(window_rectangles));
 }
 
 struct si_shader_data {
-       uint32_t                sh_base[SI_NUM_SHADERS];
+   uint32_t sh_base[SI_NUM_SHADERS];
 };
 
-#define SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK \
-       (S_02881C_USE_VTX_POINT_SIZE(1) | \
-        S_02881C_USE_VTX_EDGE_FLAG(1) | \
-        S_02881C_USE_VTX_RENDER_TARGET_INDX(1) | \
-        S_02881C_USE_VTX_VIEWPORT_INDX(1) | \
-        S_02881C_VS_OUT_MISC_VEC_ENA(1) | \
-        S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(1))
+#define SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK                                                      \
+   (S_02881C_USE_VTX_POINT_SIZE(1) | S_02881C_USE_VTX_EDGE_FLAG(1) |                               \
+    S_02881C_USE_VTX_RENDER_TARGET_INDX(1) | S_02881C_USE_VTX_VIEWPORT_INDX(1) |                   \
+    S_02881C_VS_OUT_MISC_VEC_ENA(1) | S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(1))
 
 /* The list of registers whose emitted values are remembered by si_context. */
-enum si_tracked_reg {
-       SI_TRACKED_DB_RENDER_CONTROL, /* 2 consecutive registers */
-       SI_TRACKED_DB_COUNT_CONTROL,
+enum si_tracked_reg
+{
+   SI_TRACKED_DB_RENDER_CONTROL, /* 2 consecutive registers */
+   SI_TRACKED_DB_COUNT_CONTROL,
 
-       SI_TRACKED_DB_RENDER_OVERRIDE2,
-       SI_TRACKED_DB_SHADER_CONTROL,
+   SI_TRACKED_DB_RENDER_OVERRIDE2,
+   SI_TRACKED_DB_SHADER_CONTROL,
 
-       SI_TRACKED_CB_TARGET_MASK,
-       SI_TRACKED_CB_DCC_CONTROL,
+   SI_TRACKED_CB_TARGET_MASK,
+   SI_TRACKED_CB_DCC_CONTROL,
 
-       SI_TRACKED_SX_PS_DOWNCONVERT, /* 3 consecutive registers */
-       SI_TRACKED_SX_BLEND_OPT_EPSILON,
-       SI_TRACKED_SX_BLEND_OPT_CONTROL,
+   SI_TRACKED_SX_PS_DOWNCONVERT, /* 3 consecutive registers */
+   SI_TRACKED_SX_BLEND_OPT_EPSILON,
+   SI_TRACKED_SX_BLEND_OPT_CONTROL,
 
-       SI_TRACKED_PA_SC_LINE_CNTL, /* 2 consecutive registers */
-       SI_TRACKED_PA_SC_AA_CONFIG,
+   SI_TRACKED_PA_SC_LINE_CNTL, /* 2 consecutive registers */
+   SI_TRACKED_PA_SC_AA_CONFIG,
 
-       SI_TRACKED_DB_EQAA,
-       SI_TRACKED_PA_SC_MODE_CNTL_1,
+   SI_TRACKED_DB_EQAA,
+   SI_TRACKED_PA_SC_MODE_CNTL_1,
 
-       SI_TRACKED_PA_SU_PRIM_FILTER_CNTL,
-       SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL,
+   SI_TRACKED_PA_SU_PRIM_FILTER_CNTL,
+   SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL,
 
-       SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, /* set with SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK */
-       SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, /* set with ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK */
-       SI_TRACKED_PA_CL_CLIP_CNTL,
+   SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, /* set with SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK */
+   SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, /* set with ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK */
+   SI_TRACKED_PA_CL_CLIP_CNTL,
 
-       SI_TRACKED_PA_SC_BINNER_CNTL_0,
-       SI_TRACKED_DB_DFSM_CONTROL,
+   SI_TRACKED_PA_SC_BINNER_CNTL_0,
+   SI_TRACKED_DB_DFSM_CONTROL,
 
-       SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, /* 4 consecutive registers */
-       SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ,
-       SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ,
-       SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ,
+   SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, /* 4 consecutive registers */
+   SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ,
+   SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ,
+   SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ,
 
-       SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET,
-       SI_TRACKED_PA_SU_VTX_CNTL,
+   SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET,
+   SI_TRACKED_PA_SU_VTX_CNTL,
 
-       SI_TRACKED_PA_SC_CLIPRECT_RULE,
+   SI_TRACKED_PA_SC_CLIPRECT_RULE,
 
-       SI_TRACKED_PA_SC_LINE_STIPPLE,
+   SI_TRACKED_PA_SC_LINE_STIPPLE,
 
-       SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
+   SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
 
-       SI_TRACKED_VGT_GSVS_RING_OFFSET_1, /* 3 consecutive registers */
-       SI_TRACKED_VGT_GSVS_RING_OFFSET_2,
-       SI_TRACKED_VGT_GSVS_RING_OFFSET_3,
+   SI_TRACKED_VGT_GSVS_RING_OFFSET_1, /* 3 consecutive registers */
+   SI_TRACKED_VGT_GSVS_RING_OFFSET_2,
+   SI_TRACKED_VGT_GSVS_RING_OFFSET_3,
 
-       SI_TRACKED_VGT_GSVS_RING_ITEMSIZE,
-       SI_TRACKED_VGT_GS_MAX_VERT_OUT,
+   SI_TRACKED_VGT_GSVS_RING_ITEMSIZE,
+   SI_TRACKED_VGT_GS_MAX_VERT_OUT,
 
-       SI_TRACKED_VGT_GS_VERT_ITEMSIZE, /* 4 consecutive registers */
-       SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1,
-       SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2,
-       SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3,
+   SI_TRACKED_VGT_GS_VERT_ITEMSIZE, /* 4 consecutive registers */
+   SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1,
+   SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2,
+   SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3,
 
-       SI_TRACKED_VGT_GS_INSTANCE_CNT,
-       SI_TRACKED_VGT_GS_ONCHIP_CNTL,
-       SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
-       SI_TRACKED_VGT_GS_MODE,
-       SI_TRACKED_VGT_PRIMITIVEID_EN,
-       SI_TRACKED_VGT_REUSE_OFF,
-       SI_TRACKED_SPI_VS_OUT_CONFIG,
-       SI_TRACKED_PA_CL_VTE_CNTL,
-       SI_TRACKED_PA_CL_NGG_CNTL,
-       SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP,
-       SI_TRACKED_GE_NGG_SUBGRP_CNTL,
+   SI_TRACKED_VGT_GS_INSTANCE_CNT,
+   SI_TRACKED_VGT_GS_ONCHIP_CNTL,
+   SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
+   SI_TRACKED_VGT_GS_MODE,
+   SI_TRACKED_VGT_PRIMITIVEID_EN,
+   SI_TRACKED_VGT_REUSE_OFF,
+   SI_TRACKED_SPI_VS_OUT_CONFIG,
+   SI_TRACKED_PA_CL_VTE_CNTL,
+   SI_TRACKED_PA_CL_NGG_CNTL,
+   SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP,
+   SI_TRACKED_GE_NGG_SUBGRP_CNTL,
 
-       SI_TRACKED_SPI_SHADER_IDX_FORMAT, /* 2 consecutive registers */
-       SI_TRACKED_SPI_SHADER_POS_FORMAT,
+   SI_TRACKED_SPI_SHADER_IDX_FORMAT, /* 2 consecutive registers */
+   SI_TRACKED_SPI_SHADER_POS_FORMAT,
 
-       SI_TRACKED_SPI_PS_INPUT_ENA, /* 2 consecutive registers */
-       SI_TRACKED_SPI_PS_INPUT_ADDR,
+   SI_TRACKED_SPI_PS_INPUT_ENA, /* 2 consecutive registers */
+   SI_TRACKED_SPI_PS_INPUT_ADDR,
 
-       SI_TRACKED_SPI_BARYC_CNTL,
-       SI_TRACKED_SPI_PS_IN_CONTROL,
+   SI_TRACKED_SPI_BARYC_CNTL,
+   SI_TRACKED_SPI_PS_IN_CONTROL,
 
-       SI_TRACKED_SPI_SHADER_Z_FORMAT, /* 2 consecutive registers */
-       SI_TRACKED_SPI_SHADER_COL_FORMAT,
+   SI_TRACKED_SPI_SHADER_Z_FORMAT, /* 2 consecutive registers */
+   SI_TRACKED_SPI_SHADER_COL_FORMAT,
 
-       SI_TRACKED_CB_SHADER_MASK,
-       SI_TRACKED_VGT_TF_PARAM,
-       SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
+   SI_TRACKED_CB_SHADER_MASK,
+   SI_TRACKED_VGT_TF_PARAM,
+   SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
 
-       SI_TRACKED_GE_PC_ALLOC,
+   SI_TRACKED_GE_PC_ALLOC,
 
-       SI_NUM_TRACKED_REGS,
+   SI_NUM_TRACKED_REGS,
 };
 
 struct si_tracked_regs {
-       uint64_t                reg_saved;
-       uint32_t                reg_value[SI_NUM_TRACKED_REGS];
-       uint32_t                spi_ps_input_cntl[32];
+   uint64_t reg_saved;
+   uint32_t reg_value[SI_NUM_TRACKED_REGS];
+   uint32_t spi_ps_input_cntl[32];
 };
 
 /* Private read-write buffer slots. */
-enum {
-       SI_ES_RING_ESGS,
-       SI_GS_RING_ESGS,
+enum
+{
+   SI_ES_RING_ESGS,
+   SI_GS_RING_ESGS,
 
-       SI_RING_GSVS,
+   SI_RING_GSVS,
 
-       SI_VS_STREAMOUT_BUF0,
-       SI_VS_STREAMOUT_BUF1,
-       SI_VS_STREAMOUT_BUF2,
-       SI_VS_STREAMOUT_BUF3,
+   SI_VS_STREAMOUT_BUF0,
+   SI_VS_STREAMOUT_BUF1,
+   SI_VS_STREAMOUT_BUF2,
+   SI_VS_STREAMOUT_BUF3,
 
-       SI_HS_CONST_DEFAULT_TESS_LEVELS,
-       SI_VS_CONST_INSTANCE_DIVISORS,
-       SI_VS_CONST_CLIP_PLANES,
-       SI_PS_CONST_POLY_STIPPLE,
-       SI_PS_CONST_SAMPLE_POSITIONS,
+   SI_HS_CONST_DEFAULT_TESS_LEVELS,
+   SI_VS_CONST_INSTANCE_DIVISORS,
+   SI_VS_CONST_CLIP_PLANES,
+   SI_PS_CONST_POLY_STIPPLE,
+   SI_PS_CONST_SAMPLE_POSITIONS,
 
-       /* Image descriptor of color buffer 0 for KHR_blend_equation_advanced. */
-       SI_PS_IMAGE_COLORBUF0,
-       SI_PS_IMAGE_COLORBUF0_HI,
-       SI_PS_IMAGE_COLORBUF0_FMASK,
-       SI_PS_IMAGE_COLORBUF0_FMASK_HI,
+   /* Image descriptor of color buffer 0 for KHR_blend_equation_advanced. */
+   SI_PS_IMAGE_COLORBUF0,
+   SI_PS_IMAGE_COLORBUF0_HI,
+   SI_PS_IMAGE_COLORBUF0_FMASK,
+   SI_PS_IMAGE_COLORBUF0_FMASK_HI,
 
-       GFX10_GS_QUERY_BUF,
+   GFX10_GS_QUERY_BUF,
 
-       SI_NUM_RW_BUFFERS,
+   SI_NUM_RW_BUFFERS,
 };
 
 /* Indices into sctx->descriptors, laid out so that gfx and compute pipelines
@@ -406,122 +389,111 @@ enum {
  *  11 - compute const and shader buffers
  *  12 - compute samplers and images
  */
-enum {
-       SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
-       SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
-       SI_NUM_SHADER_DESCS,
+enum
+{
+   SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
+   SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
+   SI_NUM_SHADER_DESCS,
 };
 
-#define SI_DESCS_RW_BUFFERS            0
-#define SI_DESCS_FIRST_SHADER          1
-#define SI_DESCS_FIRST_COMPUTE         (SI_DESCS_FIRST_SHADER + \
-                                        PIPE_SHADER_COMPUTE * SI_NUM_SHADER_DESCS)
-#define SI_NUM_DESCS                   (SI_DESCS_FIRST_SHADER + \
-                                        SI_NUM_SHADERS * SI_NUM_SHADER_DESCS)
+#define SI_DESCS_RW_BUFFERS    0
+#define SI_DESCS_FIRST_SHADER  1
+#define SI_DESCS_FIRST_COMPUTE (SI_DESCS_FIRST_SHADER + PIPE_SHADER_COMPUTE * SI_NUM_SHADER_DESCS)
+#define SI_NUM_DESCS           (SI_DESCS_FIRST_SHADER + SI_NUM_SHADERS * SI_NUM_SHADER_DESCS)
 
-#define SI_DESCS_SHADER_MASK(name) \
-       u_bit_consecutive(SI_DESCS_FIRST_SHADER + \
-                         PIPE_SHADER_##name * SI_NUM_SHADER_DESCS, \
-                         SI_NUM_SHADER_DESCS)
+#define SI_DESCS_SHADER_MASK(name)                                                                 \
+   u_bit_consecutive(SI_DESCS_FIRST_SHADER + PIPE_SHADER_##name * SI_NUM_SHADER_DESCS,             \
+                     SI_NUM_SHADER_DESCS)
 
-static inline unsigned
-si_const_and_shader_buffer_descriptors_idx(unsigned shader)
+static inline unsigned si_const_and_shader_buffer_descriptors_idx(unsigned shader)
 {
-       return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
-              SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS;
+   return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
+          SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS;
 }
 
-static inline unsigned
-si_sampler_and_image_descriptors_idx(unsigned shader)
+static inline unsigned si_sampler_and_image_descriptors_idx(unsigned shader)
 {
-       return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
-              SI_SHADER_DESCS_SAMPLERS_AND_IMAGES;
+   return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
+          SI_SHADER_DESCS_SAMPLERS_AND_IMAGES;
 }
 
 /* This represents descriptors in memory, such as buffer resources,
  * image resources, and sampler states.
  */
 struct si_descriptors {
-       /* The list of descriptors in malloc'd memory. */
-       uint32_t *list;
-       /* The list in mapped GPU memory. */
-       uint32_t *gpu_list;
-
-       /* The buffer where the descriptors have been uploaded. */
-       struct si_resource *buffer;
-       uint64_t gpu_address;
-
-       /* The maximum number of descriptors. */
-       uint32_t num_elements;
-
-       /* Slots that are used by currently-bound shaders.
-        * It determines which slots are uploaded.
-        */
-       uint32_t first_active_slot;
-       uint32_t num_active_slots;
-
-       /* The SH register offset relative to USER_DATA*_0 where the pointer
-        * to the descriptor array will be stored. */
-       short shader_userdata_offset;
-       /* The size of one descriptor. */
-       ubyte element_dw_size;
-       /* If there is only one slot enabled, bind it directly instead of
-        * uploading descriptors. -1 if disabled. */
-       signed char slot_index_to_bind_directly;
+   /* The list of descriptors in malloc'd memory. */
+   uint32_t *list;
+   /* The list in mapped GPU memory. */
+   uint32_t *gpu_list;
+
+   /* The buffer where the descriptors have been uploaded. */
+   struct si_resource *buffer;
+   uint64_t gpu_address;
+
+   /* The maximum number of descriptors. */
+   uint32_t num_elements;
+
+   /* Slots that are used by currently-bound shaders.
+    * It determines which slots are uploaded.
+    */
+   uint32_t first_active_slot;
+   uint32_t num_active_slots;
+
+   /* The SH register offset relative to USER_DATA*_0 where the pointer
+    * to the descriptor array will be stored. */
+   short shader_userdata_offset;
+   /* The size of one descriptor. */
+   ubyte element_dw_size;
+   /* If there is only one slot enabled, bind it directly instead of
+    * uploading descriptors. -1 if disabled. */
+   signed char slot_index_to_bind_directly;
 };
 
 struct si_buffer_resources {
-       struct pipe_resource            **buffers; /* this has num_buffers elements */
-       unsigned                        *offsets; /* this has num_buffers elements */
+   struct pipe_resource **buffers; /* this has num_buffers elements */
+   unsigned *offsets;              /* this has num_buffers elements */
 
-       enum radeon_bo_priority         priority:6;
-       enum radeon_bo_priority         priority_constbuf:6;
+   enum radeon_bo_priority priority : 6;
+   enum radeon_bo_priority priority_constbuf : 6;
 
-       /* The i-th bit is set if that element is enabled (non-NULL resource). */
-       unsigned                        enabled_mask;
-       unsigned                        writable_mask;
+   /* The i-th bit is set if that element is enabled (non-NULL resource). */
+   unsigned enabled_mask;
+   unsigned writable_mask;
 };
 
-#define si_pm4_state_changed(sctx, member) \
-       ((sctx)->queued.named.member != (sctx)->emitted.named.member)
+#define si_pm4_state_changed(sctx, member)                                                         \
+   ((sctx)->queued.named.member != (sctx)->emitted.named.member)
 
-#define si_pm4_state_enabled_and_changed(sctx, member) \
-       ((sctx)->queued.named.member && si_pm4_state_changed(sctx, member))
+#define si_pm4_state_enabled_and_changed(sctx, member)                                             \
+   ((sctx)->queued.named.member && si_pm4_state_changed(sctx, member))
 
-#define si_pm4_bind_state(sctx, member, value) \
-       do { \
-               (sctx)->queued.named.member = (value); \
-               (sctx)->dirty_states |= SI_STATE_BIT(member); \
-       } while(0)
+#define si_pm4_bind_state(sctx, member, value)                                                     \
+   do {                                                                                            \
+      (sctx)->queued.named.member = (value);                                                       \
+      (sctx)->dirty_states |= SI_STATE_BIT(member);                                                \
+   } while (0)
 
-#define si_pm4_delete_state(sctx, member, value) \
-       do { \
-               if ((sctx)->queued.named.member == (value)) { \
-                       (sctx)->queued.named.member = NULL; \
-               } \
-               si_pm4_free_state(sctx, (struct si_pm4_state *)(value), \
-                                 SI_STATE_IDX(member)); \
-       } while(0)
+#define si_pm4_delete_state(sctx, member, value)                                                   \
+   do {                                                                                            \
+      if ((sctx)->queued.named.member == (value)) {                                                \
+         (sctx)->queued.named.member = NULL;                                                       \
+      }                                                                                            \
+      si_pm4_free_state(sctx, (struct si_pm4_state *)(value), SI_STATE_IDX(member));               \
+   } while (0)
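+/* Typical usage (the 'blend_state' name is only illustrative):
+ *   si_pm4_bind_state(sctx, blend, blend_state);
+ * queues the new state and ORs SI_STATE_BIT(blend) into dirty_states so the
+ * driver knows to re-emit it; si_pm4_delete_state() clears the binding if it
+ * matches and releases the state via si_pm4_free_state().
+ */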
 
 /* si_descriptors.c */
-void si_set_mutable_tex_desc_fields(struct si_screen *sscreen,
-                                   struct si_texture *tex,
-                                   const struct legacy_surf_level *base_level_info,
-                                   unsigned base_level, unsigned first_level,
-                                   unsigned block_width, bool is_stencil,
-                                   uint32_t *state);
+void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture *tex,
+                                    const struct legacy_surf_level *base_level_info,
+                                    unsigned base_level, unsigned first_level, unsigned block_width,
+                                    bool is_stencil, uint32_t *state);
 void si_update_ps_colorbuf0_slot(struct si_context *sctx);
-void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader,
-                                uint slot, struct pipe_constant_buffer *cbuf);
-void si_get_shader_buffers(struct si_context *sctx,
-                          enum pipe_shader_type shader,
-                          uint start_slot, uint count,
-                          struct pipe_shader_buffer *sbuf);
-void si_set_ring_buffer(struct si_context *sctx, uint slot,
-                       struct pipe_resource *buffer,
-                       unsigned stride, unsigned num_records,
-                       bool add_tid, bool swizzle,
-                       unsigned element_size, unsigned index_stride, uint64_t offset);
+void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, uint slot,
+                                 struct pipe_constant_buffer *cbuf);
+void si_get_shader_buffers(struct si_context *sctx, enum pipe_shader_type shader, uint start_slot,
+                           uint count, struct pipe_shader_buffer *sbuf);
+void si_set_ring_buffer(struct si_context *sctx, uint slot, struct pipe_resource *buffer,
+                        unsigned stride, unsigned num_records, bool add_tid, bool swizzle,
+                        unsigned element_size, unsigned index_stride, uint64_t offset);
 void si_init_all_descriptors(struct si_context *sctx);
 bool si_upload_vertex_buffer_descriptors(struct si_context *sctx);
 bool si_upload_graphics_shader_descriptors(struct si_context *sctx);
@@ -530,102 +502,84 @@ void si_release_all_descriptors(struct si_context *sctx);
 void si_gfx_resources_add_all_to_bo_list(struct si_context *sctx);
 void si_compute_resources_add_all_to_bo_list(struct si_context *sctx);
 void si_all_descriptors_begin_new_cs(struct si_context *sctx);
-void si_upload_const_buffer(struct si_context *sctx, struct si_resource **buf,
-                           const uint8_t *ptr, unsigned size, uint32_t *const_offset);
+void si_upload_const_buffer(struct si_context *sctx, struct si_resource **buf, const uint8_t *ptr,
+                            unsigned size, uint32_t *const_offset);
 void si_update_all_texture_descriptors(struct si_context *sctx);
 void si_shader_change_notify(struct si_context *sctx);
 void si_update_needs_color_decompress_masks(struct si_context *sctx);
 void si_emit_graphics_shader_pointers(struct si_context *sctx);
 void si_emit_compute_shader_pointers(struct si_context *sctx);
-void si_set_rw_buffer(struct si_context *sctx,
-                     uint slot, const struct pipe_constant_buffer *input);
+void si_set_rw_buffer(struct si_context *sctx, uint slot, const struct pipe_constant_buffer *input);
 void si_set_rw_shader_buffer(struct si_context *sctx, uint slot,
-                            const struct pipe_shader_buffer *sbuffer);
+                             const struct pipe_shader_buffer *sbuffer);
 void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
-                              uint64_t new_active_mask);
-void si_set_active_descriptors_for_shader(struct si_context *sctx,
-                                         struct si_shader_selector *sel);
-bool si_bindless_descriptor_can_reclaim_slab(void *priv,
-                                            struct pb_slab_entry *entry);
-struct pb_slab *si_bindless_descriptor_slab_alloc(void *priv, unsigned heap,
-                                                 unsigned entry_size,
-                                                 unsigned group_index);
+                               uint64_t new_active_mask);
+void si_set_active_descriptors_for_shader(struct si_context *sctx, struct si_shader_selector *sel);
+bool si_bindless_descriptor_can_reclaim_slab(void *priv, struct pb_slab_entry *entry);
+struct pb_slab *si_bindless_descriptor_slab_alloc(void *priv, unsigned heap, unsigned entry_size,
+                                                  unsigned group_index);
 void si_bindless_descriptor_slab_free(void *priv, struct pb_slab *pslab);
 void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf);
 /* si_state.c */
 void si_init_state_compute_functions(struct si_context *sctx);
 void si_init_state_functions(struct si_context *sctx);
 void si_init_screen_state_functions(struct si_screen *sscreen);
-void
-si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf,
-                         enum pipe_format format,
-                         unsigned offset, unsigned size,
-                         uint32_t *state);
-struct pipe_sampler_view *
-si_create_sampler_view_custom(struct pipe_context *ctx,
-                             struct pipe_resource *texture,
-                             const struct pipe_sampler_view *state,
-                             unsigned width0, unsigned height0,
-                             unsigned force_level);
+void si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf,
+                               enum pipe_format format, unsigned offset, unsigned size,
+                               uint32_t *state);
+struct pipe_sampler_view *si_create_sampler_view_custom(struct pipe_context *ctx,
+                                                        struct pipe_resource *texture,
+                                                        const struct pipe_sampler_view *state,
+                                                        unsigned width0, unsigned height0,
+                                                        unsigned force_level);
 void si_update_fb_dirtiness_after_rendering(struct si_context *sctx);
 void si_update_ps_iter_samples(struct si_context *sctx);
 void si_save_qbo_state(struct si_context *sctx, struct si_qbo_state *st);
 void si_restore_qbo_state(struct si_context *sctx, struct si_qbo_state *st);
-void si_set_occlusion_query_state(struct si_context *sctx,
-                                 bool old_perfect_enable);
+void si_set_occlusion_query_state(struct si_context *sctx, bool old_perfect_enable);
 
 struct si_fast_udiv_info32 {
    unsigned multiplier; /* the "magic number" multiplier */
-   unsigned pre_shift; /* shift for the dividend before multiplying */
+   unsigned pre_shift;  /* shift for the dividend before multiplying */
    unsigned post_shift; /* shift for the dividend after multiplying */
-   int increment; /* 0 or 1; if set then increment the numerator, using one of
-                     the two strategies */
+   int increment;       /* 0 or 1; if set then increment the numerator, using one of
+                           the two strategies */
 };
 
-struct si_fast_udiv_info32
-si_compute_fast_udiv_info32(uint32_t D, unsigned num_bits);
+struct si_fast_udiv_info32 si_compute_fast_udiv_info32(uint32_t D, unsigned num_bits);
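
Context for the struct above: the driver precomputes a "magic" multiplier plus shifts so a division by a value unknown at compile time can be done with a multiply and shifts. Below is a minimal, self-contained sketch of that general technique using divisor 5 with hand-checked constants; it is an illustration only, not the radeonsi helper that consumes si_fast_udiv_info32, and the mapping onto the struct fields noted in the comment is an assumption.

#include <assert.h>
#include <stdint.h>

/* Illustration: unsigned division by 5 via a precomputed magic number.
 * In si_fast_udiv_info32 terms this would roughly correspond to
 * multiplier = 0xCCCCCCCD, pre_shift = 0, post_shift = 2, increment = 0
 * (assumed mapping). */
static uint32_t udiv5(uint32_t n)
{
   uint32_t hi = (uint32_t)(((uint64_t)n * 0xCCCCCCCDu) >> 32); /* high 32 bits of the product */
   return hi >> 2;                                              /* post-shift */
}

int main(void)
{
   for (uint32_t n = 0; n < 1000000; n++)
      assert(udiv5(n) == n / 5);
   return 0;
}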
 
 /* si_state_binning.c */
 void si_emit_dpbb_state(struct si_context *sctx);
 
 /* si_state_shaders.c */
 void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es,
-                        unsigned char ir_sha1_cache_key[20]);
-bool si_shader_cache_load_shader(struct si_screen *sscreen,
-                                unsigned char ir_sha1_cache_key[20],
-                                struct si_shader *shader);
-void si_shader_cache_insert_shader(struct si_screen *sscreen,
-                                  unsigned char ir_sha1_cache_key[20],
-                                  struct si_shader *shader,
-                                  bool insert_into_disk_cache);
+                         unsigned char ir_sha1_cache_key[20]);
+bool si_shader_cache_load_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20],
+                                 struct si_shader *shader);
+void si_shader_cache_insert_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20],
+                                   struct si_shader *shader, bool insert_into_disk_cache);
 bool si_update_shaders(struct si_context *sctx);
 void si_init_screen_live_shader_cache(struct si_screen *sscreen);
 void si_init_shader_functions(struct si_context *sctx);
 bool si_init_shader_cache(struct si_screen *sscreen);
 void si_destroy_shader_cache(struct si_screen *sscreen);
 void si_schedule_initial_compile(struct si_context *sctx, unsigned processor,
-                                struct util_queue_fence *ready_fence,
-                                struct si_compiler_ctx_state *compiler_ctx_state,
-                                void *job, util_queue_execute_func execute);
-void si_get_active_slot_masks(const struct si_shader_info *info,
-                             uint32_t *const_and_shader_buffers,
-                             uint64_t *samplers_and_images);
-int si_shader_select_with_key(struct si_screen *sscreen,
-                             struct si_shader_ctx_state *state,
-                             struct si_compiler_ctx_state *compiler_state,
-                             struct si_shader_key *key,
-                             int thread_index,
-                             bool optimized_or_none);
-void si_shader_selector_key_vs(struct si_context *sctx,
-                              struct si_shader_selector *vs,
-                              struct si_shader_key *key,
-                              struct si_vs_prolog_bits *prolog_key);
+                                 struct util_queue_fence *ready_fence,
+                                 struct si_compiler_ctx_state *compiler_ctx_state, void *job,
+                                 util_queue_execute_func execute);
+void si_get_active_slot_masks(const struct si_shader_info *info, uint32_t *const_and_shader_buffers,
+                              uint64_t *samplers_and_images);
+int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_state *state,
+                              struct si_compiler_ctx_state *compiler_state,
+                              struct si_shader_key *key, int thread_index, bool optimized_or_none);
+void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selector *vs,
+                               struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key);
 unsigned si_get_input_prim(const struct si_shader_selector *gs);
 bool si_update_ngg(struct si_context *sctx);
 
 /* si_state_draw.c */
 void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs,
-                         unsigned cp_coher_cntl);
+                          unsigned cp_coher_cntl);
 void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx);
 void gfx10_emit_cache_flush(struct si_context *sctx);
 void si_emit_cache_flush(struct si_context *sctx);
@@ -639,35 +593,33 @@ void si_emit_sample_locations(struct radeon_cmdbuf *cs, int nr_samples);
 /* si_state_streamout.c */
 void si_streamout_buffers_dirty(struct si_context *sctx);
 void si_emit_streamout_end(struct si_context *sctx);
-void si_update_prims_generated_query_state(struct si_context *sctx,
-                                          unsigned type, int diff);
+void si_update_prims_generated_query_state(struct si_context *sctx, unsigned type, int diff);
 void si_init_streamout_functions(struct si_context *sctx);
 
-
 static inline unsigned si_get_constbuf_slot(unsigned slot)
 {
-       /* Constant buffers are in slots [16..31], ascending */
-       return SI_NUM_SHADER_BUFFERS + slot;
+   /* Constant buffers are in slots [16..31], ascending */
+   return SI_NUM_SHADER_BUFFERS + slot;
 }
 
 static inline unsigned si_get_shaderbuf_slot(unsigned slot)
 {
-       /* shader buffers are in slots [15..0], descending */
-       return SI_NUM_SHADER_BUFFERS - 1 - slot;
+   /* shader buffers are in slots [15..0], descending */
+   return SI_NUM_SHADER_BUFFERS - 1 - slot;
 }
 
 static inline unsigned si_get_sampler_slot(unsigned slot)
 {
-       /* 32 samplers are in sampler slots [16..47], 16 dw per slot, ascending */
-       /* those are equivalent to image slots [32..95], 8 dw per slot, ascending  */
-       return SI_NUM_IMAGE_SLOTS / 2 + slot;
+   /* 32 samplers are in sampler slots [16..47], 16 dw per slot, ascending */
+   /* those are equivalent to image slots [32..95], 8 dw per slot, ascending  */
+   return SI_NUM_IMAGE_SLOTS / 2 + slot;
 }
 
 static inline unsigned si_get_image_slot(unsigned slot)
 {
-       /* image slots are in [31..0] (sampler slots [15..0]), descending */
-       /* images are in slots [31..16], while FMASKs are in slots [15..0] */
-       return SI_NUM_IMAGE_SLOTS - 1 - slot;
+   /* image slots are in [31..0] (sampler slots [15..0]), descending */
+   /* images are in slots [31..16], while FMASKs are in slots [15..0] */
+   return SI_NUM_IMAGE_SLOTS - 1 - slot;
 }
 
 #endif
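
Concretely, and assuming SI_NUM_SHADER_BUFFERS == 16 and SI_NUM_IMAGE_SLOTS == 32 as the comments imply: si_get_constbuf_slot(0) maps to descriptor slot 16 and si_get_constbuf_slot(15) to 31 (ascending); si_get_shaderbuf_slot(0) maps to 15 and si_get_shaderbuf_slot(15) to 0 (descending); si_get_sampler_slot(0) maps to 16; and si_get_image_slot(0) maps to 31 while image/FMASK slot 31 maps to 0.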
index 1251b53785bdba654ca000803106221ce736b515..39bb94366f251457fcc9d1dce142f19fdf100b70 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_state_binning.c
+++ b/src/gallium/drivers/radeonsi/si_state_binning.c
 #include "sid.h"
 
 struct uvec2 {
-       unsigned x, y;
+   unsigned x, y;
 };
 
 struct si_bin_size_map {
-       unsigned start;
-       unsigned bin_size_x;
-       unsigned bin_size_y;
+   unsigned start;
+   unsigned bin_size_x;
+   unsigned bin_size_y;
 };
 
 typedef struct si_bin_size_map si_bin_size_subtable[3][10];
 
 /* Find the bin size where sum is >= table[i].start and < table[i + 1].start. */
-static struct uvec2 si_find_bin_size(struct si_screen *sscreen,
-                                    const si_bin_size_subtable table[],
-                                    unsigned sum)
+static struct uvec2 si_find_bin_size(struct si_screen *sscreen, const si_bin_size_subtable table[],
+                                     unsigned sum)
 {
-       unsigned log_num_rb_per_se =
-               util_logbase2_ceil(sscreen->info.num_render_backends /
-                                  sscreen->info.max_se);
-       unsigned log_num_se = util_logbase2_ceil(sscreen->info.max_se);
-       unsigned i;
-
-       /* Get the chip-specific subtable. */
-       const struct si_bin_size_map *subtable =
-               &table[log_num_rb_per_se][log_num_se][0];
-
-       for (i = 0; subtable[i].bin_size_x != 0; i++) {
-               if (sum >= subtable[i].start && sum < subtable[i + 1].start)
-                       break;
-       }
-
-       struct uvec2 size = {subtable[i].bin_size_x, subtable[i].bin_size_y};
-       return size;
+   unsigned log_num_rb_per_se =
+      util_logbase2_ceil(sscreen->info.num_render_backends / sscreen->info.max_se);
+   unsigned log_num_se = util_logbase2_ceil(sscreen->info.max_se);
+   unsigned i;
+
+   /* Get the chip-specific subtable. */
+   const struct si_bin_size_map *subtable = &table[log_num_rb_per_se][log_num_se][0];
+
+   for (i = 0; subtable[i].bin_size_x != 0; i++) {
+      if (sum >= subtable[i].start && sum < subtable[i + 1].start)
+         break;
+   }
+
+   struct uvec2 size = {subtable[i].bin_size_x, subtable[i].bin_size_y};
+   return size;
 }
 
-static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
-                                         unsigned cb_target_enabled_4bit)
+static struct uvec2 si_get_color_bin_size(struct si_context *sctx, unsigned cb_target_enabled_4bit)
 {
-       unsigned num_fragments = sctx->framebuffer.nr_color_samples;
-       unsigned sum = 0;
-
-       /* Compute the sum of all Bpp. */
-       for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
-               if (!(cb_target_enabled_4bit & (0xf << (i * 4))))
-                       continue;
-
-               struct si_texture *tex =
-                       (struct si_texture*)sctx->framebuffer.state.cbufs[i]->texture;
-               sum += tex->surface.bpe;
-       }
-
-       /* Multiply the sum by some function of the number of samples. */
-       if (num_fragments >= 2) {
-               if (si_get_ps_iter_samples(sctx) >= 2)
-                       sum *= num_fragments;
-               else
-                       sum *= 2;
-       }
-
-       static const si_bin_size_subtable table[] = {
-               {
-                       /* One RB / SE */
-                       {
-                               /* One shader engine */
-                               {        0,  128,  128 },
-                               {        1,   64,  128 },
-                               {        2,   32,  128 },
-                               {        3,   16,  128 },
-                               {       17,    0,    0 },
-                       },
-                       {
-                               /* Two shader engines */
-                               {        0,  128,  128 },
-                               {        2,   64,  128 },
-                               {        3,   32,  128 },
-                               {        5,   16,  128 },
-                               {       17,    0,    0 },
-                       },
-                       {
-                               /* Four shader engines */
-                               {        0,  128,  128 },
-                               {        3,   64,  128 },
-                               {        5,   16,  128 },
-                               {       17,    0,    0 },
-                       },
-               },
-               {
-                       /* Two RB / SE */
-                       {
-                               /* One shader engine */
-                               {        0,  128,  128 },
-                               {        2,   64,  128 },
-                               {        3,   32,  128 },
-                               {        9,   16,  128 },
-                               {       33,    0,    0 },
-                       },
-                       {
-                               /* Two shader engines */
-                               {        0,  128,  128 },
-                               {        3,   64,  128 },
-                               {        5,   32,  128 },
-                               {        9,   16,  128 },
-                               {       33,    0,    0 },
-                       },
-                       {
-                               /* Four shader engines */
-                               {        0,  256,  256 },
-                               {        2,  128,  256 },
-                               {        3,  128,  128 },
-                               {        5,   64,  128 },
-                               {        9,   16,  128 },
-                               {       33,    0,    0 },
-                       },
-               },
-               {
-                       /* Four RB / SE */
-                       {
-                               /* One shader engine */
-                               {        0,  128,  256 },
-                               {        2,  128,  128 },
-                               {        3,   64,  128 },
-                               {        5,   32,  128 },
-                               {        9,   16,  128 },
-                               {       17,    0,    0 },
-                       },
-                       {
-                               /* Two shader engines */
-                               {        0,  256,  256 },
-                               {        2,  128,  256 },
-                               {        3,  128,  128 },
-                               {        5,   64,  128 },
-                               {        9,   32,  128 },
-                               {       17,   16,  128 },
-                               {       33,    0,    0 },
-                       },
-                       {
-                               /* Four shader engines */
-                               {        0,  256,  512 },
-                               {        2,  128,  512 },
-                               {        3,   64,  512 },
-                               {        5,   32,  512 },
-                               {        9,   32,  256 },
-                               {       17,   32,  128 },
-                               {       33,    0,    0 },
-                       },
-               },
-       };
-
-       return si_find_bin_size(sctx->screen, table, sum);
+   unsigned num_fragments = sctx->framebuffer.nr_color_samples;
+   unsigned sum = 0;
+
+   /* Compute the sum of all Bpp. */
+   for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
+      if (!(cb_target_enabled_4bit & (0xf << (i * 4))))
+         continue;
+
+      struct si_texture *tex = (struct si_texture *)sctx->framebuffer.state.cbufs[i]->texture;
+      sum += tex->surface.bpe;
+   }
+
+   /* Multiply the sum by some function of the number of samples. */
+   if (num_fragments >= 2) {
+      if (si_get_ps_iter_samples(sctx) >= 2)
+         sum *= num_fragments;
+      else
+         sum *= 2;
+   }
+
+   static const si_bin_size_subtable table[] = {
+      {
+         /* One RB / SE */
+         {
+            /* One shader engine */
+            {0, 128, 128},
+            {1, 64, 128},
+            {2, 32, 128},
+            {3, 16, 128},
+            {17, 0, 0},
+         },
+         {
+            /* Two shader engines */
+            {0, 128, 128},
+            {2, 64, 128},
+            {3, 32, 128},
+            {5, 16, 128},
+            {17, 0, 0},
+         },
+         {
+            /* Four shader engines */
+            {0, 128, 128},
+            {3, 64, 128},
+            {5, 16, 128},
+            {17, 0, 0},
+         },
+      },
+      {
+         /* Two RB / SE */
+         {
+            /* One shader engine */
+            {0, 128, 128},
+            {2, 64, 128},
+            {3, 32, 128},
+            {9, 16, 128},
+            {33, 0, 0},
+         },
+         {
+            /* Two shader engines */
+            {0, 128, 128},
+            {3, 64, 128},
+            {5, 32, 128},
+            {9, 16, 128},
+            {33, 0, 0},
+         },
+         {
+            /* Four shader engines */
+            {0, 256, 256},
+            {2, 128, 256},
+            {3, 128, 128},
+            {5, 64, 128},
+            {9, 16, 128},
+            {33, 0, 0},
+         },
+      },
+      {
+         /* Four RB / SE */
+         {
+            /* One shader engine */
+            {0, 128, 256},
+            {2, 128, 128},
+            {3, 64, 128},
+            {5, 32, 128},
+            {9, 16, 128},
+            {17, 0, 0},
+         },
+         {
+            /* Two shader engines */
+            {0, 256, 256},
+            {2, 128, 256},
+            {3, 128, 128},
+            {5, 64, 128},
+            {9, 32, 128},
+            {17, 16, 128},
+            {33, 0, 0},
+         },
+         {
+            /* Four shader engines */
+            {0, 256, 512},
+            {2, 128, 512},
+            {3, 64, 512},
+            {5, 32, 512},
+            {9, 32, 256},
+            {17, 32, 128},
+            {33, 0, 0},
+         },
+      },
+   };
+
+   return si_find_bin_size(sctx->screen, table, sum);
 }
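
As a worked example of the lookup above (illustrative numbers): with two RGBA8 colorbuffers enabled (4 bytes per element each) and 4 color fragments but fewer than 2 PS iteration samples, sum = (4 + 4) * 2 = 16. On a chip with four render backends per SE and two SEs, the "Four RB / SE, Two shader engines" subtable applies, and 16 falls in the [9, 17) row, giving a 32x128 color bin.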
 
 static struct uvec2 si_get_depth_bin_size(struct si_context *sctx)
 {
-       struct si_state_dsa *dsa = sctx->queued.named.dsa;
-
-       if (!sctx->framebuffer.state.zsbuf ||
-           (!dsa->depth_enabled && !dsa->stencil_enabled)) {
-               /* Return the max size. */
-               struct uvec2 size = {512, 512};
-               return size;
-       }
-
-       struct si_texture *tex =
-               (struct si_texture*)sctx->framebuffer.state.zsbuf->texture;
-       unsigned depth_coeff = dsa->depth_enabled ? 5 : 0;
-       unsigned stencil_coeff = tex->surface.has_stencil &&
-                                dsa->stencil_enabled ? 1 : 0;
-       unsigned sum = 4 * (depth_coeff + stencil_coeff) *
-                      MAX2(tex->buffer.b.b.nr_samples, 1);
-
-       static const si_bin_size_subtable table[] = {
-               {
-                       // One RB / SE
-                       {
-                               // One shader engine
-                               {        0,   64,  512 },
-                               {        2,   64,  256 },
-                               {        4,   64,  128 },
-                               {        7,   32,  128 },
-                               {       13,   16,  128 },
-                               {       49,    0,    0 },
-                       },
-                       {
-                               // Two shader engines
-                               {        0,  128,  512 },
-                               {        2,   64,  512 },
-                               {        4,   64,  256 },
-                               {        7,   64,  128 },
-                               {       13,   32,  128 },
-                               {       25,   16,  128 },
-                               {       49,    0,    0 },
-                       },
-                       {
-                               // Four shader engines
-                               {        0,  256,  512 },
-                               {        2,  128,  512 },
-                               {        4,   64,  512 },
-                               {        7,   64,  256 },
-                               {       13,   64,  128 },
-                               {       25,   16,  128 },
-                               {       49,    0,    0 },
-                       },
-               },
-               {
-                       // Two RB / SE
-                       {
-                               // One shader engine
-                               {        0,  128,  512 },
-                               {        2,   64,  512 },
-                               {        4,   64,  256 },
-                               {        7,   64,  128 },
-                               {       13,   32,  128 },
-                               {       25,   16,  128 },
-                               {       97,    0,    0 },
-                       },
-                       {
-                               // Two shader engines
-                               {        0,  256,  512 },
-                               {        2,  128,  512 },
-                               {        4,   64,  512 },
-                               {        7,   64,  256 },
-                               {       13,   64,  128 },
-                               {       25,   32,  128 },
-                               {       49,   16,  128 },
-                               {       97,    0,    0 },
-                       },
-                       {
-                               // Four shader engines
-                               {        0,  512,  512 },
-                               {        2,  256,  512 },
-                               {        4,  128,  512 },
-                               {        7,   64,  512 },
-                               {       13,   64,  256 },
-                               {       25,   64,  128 },
-                               {       49,   16,  128 },
-                               {       97,    0,    0 },
-                       },
-               },
-               {
-                       // Four RB / SE
-                       {
-                               // One shader engine
-                               {        0,  256,  512 },
-                               {        2,  128,  512 },
-                               {        4,   64,  512 },
-                               {        7,   64,  256 },
-                               {       13,   64,  128 },
-                               {       25,   32,  128 },
-                               {       49,   16,  128 },
-                               {      193,    0,    0 },
-                       },
-                       {
-                               // Two shader engines
-                               {        0,  512,  512 },
-                               {        2,  256,  512 },
-                               {        4,  128,  512 },
-                               {        7,   64,  512 },
-                               {       13,   64,  256 },
-                               {       25,   64,  128 },
-                               {       49,   32,  128 },
-                               {       97,   16,  128 },
-                               {      193,    0,    0 },
-                       },
-                       {
-                               // Four shader engines
-                               {        0,  512,  512 },
-                               {        4,  256,  512 },
-                               {        7,  128,  512 },
-                               {       13,   64,  512 },
-                               {       25,   32,  512 },
-                               {       49,   32,  256 },
-                               {       97,   16,  128 },
-                               {      193,    0,    0 },
-                       },
-               },
-       };
-
-       return si_find_bin_size(sctx->screen, table, sum);
+   struct si_state_dsa *dsa = sctx->queued.named.dsa;
+
+   if (!sctx->framebuffer.state.zsbuf || (!dsa->depth_enabled && !dsa->stencil_enabled)) {
+      /* Return the max size. */
+      struct uvec2 size = {512, 512};
+      return size;
+   }
+
+   struct si_texture *tex = (struct si_texture *)sctx->framebuffer.state.zsbuf->texture;
+   unsigned depth_coeff = dsa->depth_enabled ? 5 : 0;
+   unsigned stencil_coeff = tex->surface.has_stencil && dsa->stencil_enabled ? 1 : 0;
+   unsigned sum = 4 * (depth_coeff + stencil_coeff) * MAX2(tex->buffer.b.b.nr_samples, 1);
+
+   static const si_bin_size_subtable table[] = {
+      {
+         // One RB / SE
+         {
+            // One shader engine
+            {0, 64, 512},
+            {2, 64, 256},
+            {4, 64, 128},
+            {7, 32, 128},
+            {13, 16, 128},
+            {49, 0, 0},
+         },
+         {
+            // Two shader engines
+            {0, 128, 512},
+            {2, 64, 512},
+            {4, 64, 256},
+            {7, 64, 128},
+            {13, 32, 128},
+            {25, 16, 128},
+            {49, 0, 0},
+         },
+         {
+            // Four shader engines
+            {0, 256, 512},
+            {2, 128, 512},
+            {4, 64, 512},
+            {7, 64, 256},
+            {13, 64, 128},
+            {25, 16, 128},
+            {49, 0, 0},
+         },
+      },
+      {
+         // Two RB / SE
+         {
+            // One shader engine
+            {0, 128, 512},
+            {2, 64, 512},
+            {4, 64, 256},
+            {7, 64, 128},
+            {13, 32, 128},
+            {25, 16, 128},
+            {97, 0, 0},
+         },
+         {
+            // Two shader engines
+            {0, 256, 512},
+            {2, 128, 512},
+            {4, 64, 512},
+            {7, 64, 256},
+            {13, 64, 128},
+            {25, 32, 128},
+            {49, 16, 128},
+            {97, 0, 0},
+         },
+         {
+            // Four shader engines
+            {0, 512, 512},
+            {2, 256, 512},
+            {4, 128, 512},
+            {7, 64, 512},
+            {13, 64, 256},
+            {25, 64, 128},
+            {49, 16, 128},
+            {97, 0, 0},
+         },
+      },
+      {
+         // Four RB / SE
+         {
+            // One shader engine
+            {0, 256, 512},
+            {2, 128, 512},
+            {4, 64, 512},
+            {7, 64, 256},
+            {13, 64, 128},
+            {25, 32, 128},
+            {49, 16, 128},
+            {193, 0, 0},
+         },
+         {
+            // Two shader engines
+            {0, 512, 512},
+            {2, 256, 512},
+            {4, 128, 512},
+            {7, 64, 512},
+            {13, 64, 256},
+            {25, 64, 128},
+            {49, 32, 128},
+            {97, 16, 128},
+            {193, 0, 0},
+         },
+         {
+            // Four shader engines
+            {0, 512, 512},
+            {4, 256, 512},
+            {7, 128, 512},
+            {13, 64, 512},
+            {25, 32, 512},
+            {49, 32, 256},
+            {97, 16, 128},
+            {193, 0, 0},
+         },
+      },
+   };
+
+   return si_find_bin_size(sctx->screen, table, sum);
 }
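
Similarly for the depth path (illustrative numbers): with depth and stencil both enabled on a 4-sample zsbuf, sum = 4 * (5 + 1) * 4 = 96; on a two-RB-per-SE, two-SE chip that lands in the [49, 97) row of the subtable above, i.e. a 16x128 depth bin.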
 
-static void gfx10_get_bin_sizes(struct si_context *sctx,
-                               unsigned cb_target_enabled_4bit,
-                               struct uvec2 *color_bin_size,
-                               struct uvec2 *depth_bin_size)
+static void gfx10_get_bin_sizes(struct si_context *sctx, unsigned cb_target_enabled_4bit,
+                                struct uvec2 *color_bin_size, struct uvec2 *depth_bin_size)
 {
-       const unsigned ZsTagSize  = 64;
-       const unsigned ZsNumTags  = 312;
-       const unsigned CcTagSize  = 1024;
-       const unsigned CcReadTags = 31;
-       const unsigned FcTagSize  = 256;
-       const unsigned FcReadTags = 44;
-
-       const unsigned num_rbs = sctx->screen->info.num_render_backends;
-       const unsigned num_pipes = MAX2(num_rbs, sctx->screen->info.num_sdp_interfaces);
-
-       const unsigned depthBinSizeTagPart = ((ZsNumTags * num_rbs / num_pipes) * (ZsTagSize * num_pipes));
-       const unsigned colorBinSizeTagPart = ((CcReadTags * num_rbs / num_pipes) * (CcTagSize * num_pipes));
-       const unsigned fmaskBinSizeTagPart = ((FcReadTags * num_rbs / num_pipes) * (FcTagSize * num_pipes));
-
-       const unsigned minBinSizeX = 128;
-       const unsigned minBinSizeY = 64;
-
-       const unsigned num_fragments = sctx->framebuffer.nr_color_samples;
-       const unsigned num_samples = sctx->framebuffer.nr_samples;
-       const bool ps_iter_sample = si_get_ps_iter_samples(sctx) >= 2;
-
-       /* Calculate cColor and cFmask(if applicable) */
-       unsigned cColor = 0;
-       unsigned cFmask = 0;
-       bool has_fmask = false;
-
-       for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
-               if (!sctx->framebuffer.state.cbufs[i])
-                       continue;
-
-               struct si_texture *tex =
-                       (struct si_texture*)sctx->framebuffer.state.cbufs[i]->texture;
-               const unsigned mmrt =
-                       num_fragments == 1 ? 1 : (ps_iter_sample ? num_fragments : 2);
-
-               cColor += tex->surface.bpe * mmrt;
-               if (num_samples >= 2 /* if FMASK is bound */) {
-                       const unsigned fragmentsLog2 = util_logbase2(num_fragments);
-                       const unsigned samplesLog2 = util_logbase2(num_samples);
-
-                       static const unsigned cFmaskMrt[4 /* fragments */][5 /* samples */] = {
-                               { 0, 1, 1, 1, 2 }, /* fragments = 1 */
-                               { 0, 1, 1, 2, 4 }, /* fragments = 2 */
-                               { 0, 1, 1, 4, 8 }, /* fragments = 4 */
-                               { 0, 1, 2, 4, 8 }  /* fragments = 8 */
-                       };
-                       cFmask += cFmaskMrt[fragmentsLog2][samplesLog2];
-                       has_fmask = true;
-               }
-       }
-       cColor = MAX2(cColor, 1u);
-
-       const unsigned colorLog2Pixels = util_logbase2(colorBinSizeTagPart / cColor);
-       const unsigned colorBinSizeX   = 1 << ((colorLog2Pixels + 1) / 2); /* round up width */
-       const unsigned colorBinSizeY   = 1 << (colorLog2Pixels / 2);       /* round down height */
-
-       unsigned binSizeX = colorBinSizeX;
-       unsigned binSizeY = colorBinSizeY;
-
-       if (has_fmask) {
-               cFmask = MAX2(cFmask, 1u);
-
-               const unsigned fmaskLog2Pixels = util_logbase2(fmaskBinSizeTagPart / cFmask);
-               const unsigned fmaskBinSizeX   = 1 << ((fmaskLog2Pixels + 1) / 2); /* round up width */
-               const unsigned fmaskBinSizeY   = 1 << (fmaskLog2Pixels / 2);       /* round down height */
-
-               /* use the smaller of the Color vs. Fmask bin sizes */
-               if (fmaskLog2Pixels < colorLog2Pixels) {
-                       binSizeX = fmaskBinSizeX;
-                       binSizeY = fmaskBinSizeY;
-               }
-       }
-
-       /* Return size adjusted for minimum bin size */
-       color_bin_size->x = MAX2(binSizeX, minBinSizeX);
-       color_bin_size->y = MAX2(binSizeY, minBinSizeY);
-
-       if (!sctx->framebuffer.state.zsbuf) {
-               /* Set to max sizes when no depth buffer is bound. */
-               depth_bin_size->x = 512;
-               depth_bin_size->y = 512;
-       } else {
-               struct si_texture *zstex = (struct si_texture*)sctx->framebuffer.state.zsbuf->texture;
-               struct si_state_dsa *dsa = sctx->queued.named.dsa;
-
-               const unsigned cPerDepthSample   = dsa->depth_enabled ? 5 : 0;
-               const unsigned cPerStencilSample = dsa->stencil_enabled ? 1 : 0;
-               const unsigned cDepth            = (cPerDepthSample + cPerStencilSample) *
-                                                  MAX2(zstex->buffer.b.b.nr_samples, 1);
-
-               const unsigned depthLog2Pixels = util_logbase2(depthBinSizeTagPart / MAX2(cDepth, 1u));
-               unsigned       depthBinSizeX   = 1 << ((depthLog2Pixels + 1) / 2);
-               unsigned       depthBinSizeY   = 1 << (depthLog2Pixels / 2);
-
-               depth_bin_size->x = MAX2(depthBinSizeX, minBinSizeX);
-               depth_bin_size->y = MAX2(depthBinSizeY, minBinSizeY);
-       }
+   const unsigned ZsTagSize = 64;
+   const unsigned ZsNumTags = 312;
+   const unsigned CcTagSize = 1024;
+   const unsigned CcReadTags = 31;
+   const unsigned FcTagSize = 256;
+   const unsigned FcReadTags = 44;
+
+   const unsigned num_rbs = sctx->screen->info.num_render_backends;
+   const unsigned num_pipes = MAX2(num_rbs, sctx->screen->info.num_sdp_interfaces);
+
+   const unsigned depthBinSizeTagPart =
+      ((ZsNumTags * num_rbs / num_pipes) * (ZsTagSize * num_pipes));
+   const unsigned colorBinSizeTagPart =
+      ((CcReadTags * num_rbs / num_pipes) * (CcTagSize * num_pipes));
+   const unsigned fmaskBinSizeTagPart =
+      ((FcReadTags * num_rbs / num_pipes) * (FcTagSize * num_pipes));
+
+   const unsigned minBinSizeX = 128;
+   const unsigned minBinSizeY = 64;
+
+   const unsigned num_fragments = sctx->framebuffer.nr_color_samples;
+   const unsigned num_samples = sctx->framebuffer.nr_samples;
+   const bool ps_iter_sample = si_get_ps_iter_samples(sctx) >= 2;
+
+   /* Calculate cColor and cFmask(if applicable) */
+   unsigned cColor = 0;
+   unsigned cFmask = 0;
+   bool has_fmask = false;
+
+   for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
+      if (!sctx->framebuffer.state.cbufs[i])
+         continue;
+
+      struct si_texture *tex = (struct si_texture *)sctx->framebuffer.state.cbufs[i]->texture;
+      const unsigned mmrt = num_fragments == 1 ? 1 : (ps_iter_sample ? num_fragments : 2);
+
+      cColor += tex->surface.bpe * mmrt;
+      if (num_samples >= 2 /* if FMASK is bound */) {
+         const unsigned fragmentsLog2 = util_logbase2(num_fragments);
+         const unsigned samplesLog2 = util_logbase2(num_samples);
+
+         static const unsigned cFmaskMrt[4 /* fragments */][5 /* samples */] = {
+            {0, 1, 1, 1, 2}, /* fragments = 1 */
+            {0, 1, 1, 2, 4}, /* fragments = 2 */
+            {0, 1, 1, 4, 8}, /* fragments = 4 */
+            {0, 1, 2, 4, 8}  /* fragments = 8 */
+         };
+         cFmask += cFmaskMrt[fragmentsLog2][samplesLog2];
+         has_fmask = true;
+      }
+   }
+   cColor = MAX2(cColor, 1u);
+
+   const unsigned colorLog2Pixels = util_logbase2(colorBinSizeTagPart / cColor);
+   const unsigned colorBinSizeX = 1 << ((colorLog2Pixels + 1) / 2); /* round up width */
+   const unsigned colorBinSizeY = 1 << (colorLog2Pixels / 2);       /* round down height */
+
+   unsigned binSizeX = colorBinSizeX;
+   unsigned binSizeY = colorBinSizeY;
+
+   if (has_fmask) {
+      cFmask = MAX2(cFmask, 1u);
+
+      const unsigned fmaskLog2Pixels = util_logbase2(fmaskBinSizeTagPart / cFmask);
+      const unsigned fmaskBinSizeX = 1 << ((fmaskLog2Pixels + 1) / 2); /* round up width */
+      const unsigned fmaskBinSizeY = 1 << (fmaskLog2Pixels / 2);       /* round down height */
+
+      /* use the smaller of the Color vs. Fmask bin sizes */
+      if (fmaskLog2Pixels < colorLog2Pixels) {
+         binSizeX = fmaskBinSizeX;
+         binSizeY = fmaskBinSizeY;
+      }
+   }
+
+   /* Return size adjusted for minimum bin size */
+   color_bin_size->x = MAX2(binSizeX, minBinSizeX);
+   color_bin_size->y = MAX2(binSizeY, minBinSizeY);
+
+   if (!sctx->framebuffer.state.zsbuf) {
+      /* Set to max sizes when no depth buffer is bound. */
+      depth_bin_size->x = 512;
+      depth_bin_size->y = 512;
+   } else {
+      struct si_texture *zstex = (struct si_texture *)sctx->framebuffer.state.zsbuf->texture;
+      struct si_state_dsa *dsa = sctx->queued.named.dsa;
+
+      const unsigned cPerDepthSample = dsa->depth_enabled ? 5 : 0;
+      const unsigned cPerStencilSample = dsa->stencil_enabled ? 1 : 0;
+      const unsigned cDepth =
+         (cPerDepthSample + cPerStencilSample) * MAX2(zstex->buffer.b.b.nr_samples, 1);
+
+      const unsigned depthLog2Pixels = util_logbase2(depthBinSizeTagPart / MAX2(cDepth, 1u));
+      unsigned depthBinSizeX = 1 << ((depthLog2Pixels + 1) / 2);
+      unsigned depthBinSizeY = 1 << (depthLog2Pixels / 2);
+
+      depth_bin_size->x = MAX2(depthBinSizeX, minBinSizeX);
+      depth_bin_size->y = MAX2(depthBinSizeY, minBinSizeY);
+   }
 }
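
The width/height split above halves the pixel-budget logarithm, rounding the width up and the height down: if colorLog2Pixels works out to 15, the bin is (1 << 8) x (1 << 7) = 256x128, exactly 2^15 pixels, while an even logarithm such as 16 gives a square 256x256 bin. Both dimensions are then raised to at least the 128x64 minimum.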
 
 static void si_emit_dpbb_disable(struct si_context *sctx)
 {
-       unsigned initial_cdw = sctx->gfx_cs->current.cdw;
-
-       if (sctx->chip_class >= GFX10) {
-               struct uvec2 bin_size = {};
-               struct uvec2 bin_size_extend = {};
-
-               bin_size.x = 128;
-               bin_size.y = sctx->framebuffer.min_bytes_per_pixel <= 4 ? 128 : 64;
-
-               if (bin_size.x >= 32)
-                       bin_size_extend.x = util_logbase2(bin_size.x) - 5;
-               if (bin_size.y >= 32)
-                       bin_size_extend.y = util_logbase2(bin_size.y) - 5;
-
-               radeon_opt_set_context_reg(sctx, R_028C44_PA_SC_BINNER_CNTL_0,
-                       SI_TRACKED_PA_SC_BINNER_CNTL_0,
-                       S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_NEW_SC) |
-                       S_028C44_BIN_SIZE_X(bin_size.x == 16) |
-                       S_028C44_BIN_SIZE_Y(bin_size.y == 16) |
-                       S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) |
-                       S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) |
-                       S_028C44_DISABLE_START_OF_PRIM(1) |
-                       S_028C44_FLUSH_ON_BINNING_TRANSITION(sctx->last_binning_enabled != 0));
-       } else {
-               radeon_opt_set_context_reg(sctx, R_028C44_PA_SC_BINNER_CNTL_0,
-                       SI_TRACKED_PA_SC_BINNER_CNTL_0,
-                       S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
-                       S_028C44_DISABLE_START_OF_PRIM(1) |
-                       S_028C44_FLUSH_ON_BINNING_TRANSITION((sctx->family == CHIP_VEGA12 ||
-                                                             sctx->family == CHIP_VEGA20 ||
-                                                             sctx->family >= CHIP_RAVEN2) &&
-                                                            sctx->last_binning_enabled != 0));
-       }
-
-       unsigned db_dfsm_control = sctx->chip_class >= GFX10 ? R_028038_DB_DFSM_CONTROL
-                                                            : R_028060_DB_DFSM_CONTROL;
-       radeon_opt_set_context_reg(sctx, db_dfsm_control,
-                                  SI_TRACKED_DB_DFSM_CONTROL,
-                                  S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) |
-                                  S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
-       if (initial_cdw != sctx->gfx_cs->current.cdw)
-               sctx->context_roll = true;
-
-       sctx->last_binning_enabled = false;
+   unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+
+   if (sctx->chip_class >= GFX10) {
+      struct uvec2 bin_size = {};
+      struct uvec2 bin_size_extend = {};
+
+      bin_size.x = 128;
+      bin_size.y = sctx->framebuffer.min_bytes_per_pixel <= 4 ? 128 : 64;
+
+      if (bin_size.x >= 32)
+         bin_size_extend.x = util_logbase2(bin_size.x) - 5;
+      if (bin_size.y >= 32)
+         bin_size_extend.y = util_logbase2(bin_size.y) - 5;
+
+      radeon_opt_set_context_reg(
+         sctx, R_028C44_PA_SC_BINNER_CNTL_0, SI_TRACKED_PA_SC_BINNER_CNTL_0,
+         S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_NEW_SC) |
+            S_028C44_BIN_SIZE_X(bin_size.x == 16) | S_028C44_BIN_SIZE_Y(bin_size.y == 16) |
+            S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) |
+            S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) | S_028C44_DISABLE_START_OF_PRIM(1) |
+            S_028C44_FLUSH_ON_BINNING_TRANSITION(sctx->last_binning_enabled != 0));
+   } else {
+      radeon_opt_set_context_reg(
+         sctx, R_028C44_PA_SC_BINNER_CNTL_0, SI_TRACKED_PA_SC_BINNER_CNTL_0,
+         S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
+            S_028C44_DISABLE_START_OF_PRIM(1) |
+            S_028C44_FLUSH_ON_BINNING_TRANSITION((sctx->family == CHIP_VEGA12 ||
+                                                  sctx->family == CHIP_VEGA20 ||
+                                                  sctx->family >= CHIP_RAVEN2) &&
+                                                 sctx->last_binning_enabled != 0));
+   }
+
+   unsigned db_dfsm_control =
+      sctx->chip_class >= GFX10 ? R_028038_DB_DFSM_CONTROL : R_028060_DB_DFSM_CONTROL;
+   radeon_opt_set_context_reg(
+      sctx, db_dfsm_control, SI_TRACKED_DB_DFSM_CONTROL,
+      S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) | S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
+   if (initial_cdw != sctx->gfx_cs->current.cdw)
+      sctx->context_roll = true;
+
+   sctx->last_binning_enabled = false;
 }
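
Note the register encoding used in both the disable and enable paths: a bin dimension of 16 is signalled through the dedicated BIN_SIZE_X/Y bit, while larger power-of-two dimensions go through the *_EXTEND fields as log2(size) - 5, e.g. 32 -> 0, 64 -> 1, 128 -> 2.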
 
 void si_emit_dpbb_state(struct si_context *sctx)
 {
-       struct si_screen *sscreen = sctx->screen;
-       struct si_state_blend *blend = sctx->queued.named.blend;
-       struct si_state_dsa *dsa = sctx->queued.named.dsa;
-       unsigned db_shader_control = sctx->ps_db_shader_control;
-
-       assert(sctx->chip_class >= GFX9);
-
-       if (!sscreen->dpbb_allowed || sctx->dpbb_force_off) {
-               si_emit_dpbb_disable(sctx);
-               return;
-       }
-
-       bool ps_can_kill = G_02880C_KILL_ENABLE(db_shader_control) ||
-                          G_02880C_MASK_EXPORT_ENABLE(db_shader_control) ||
-                          G_02880C_COVERAGE_TO_MASK_ENABLE(db_shader_control) ||
-                          blend->alpha_to_coverage;
-
-       bool db_can_reject_z_trivially =
-               !G_02880C_Z_EXPORT_ENABLE(db_shader_control) ||
-               G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control) ||
-               G_02880C_DEPTH_BEFORE_SHADER(db_shader_control);
-
-       /* Disable DPBB when it's believed to be inefficient. */
-       if (sscreen->info.num_render_backends > 4 &&
-           ps_can_kill &&
-           db_can_reject_z_trivially &&
-           sctx->framebuffer.state.zsbuf &&
-           dsa->db_can_write) {
-               si_emit_dpbb_disable(sctx);
-               return;
-       }
-
-       /* Compute the bin size. */
-       /* TODO: We could also look at enabled pixel shader outputs. */
-       unsigned cb_target_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit &
-                                         blend->cb_target_enabled_4bit;
-       struct uvec2 color_bin_size, depth_bin_size;
-
-       if (sctx->chip_class >= GFX10) {
-               gfx10_get_bin_sizes(sctx, cb_target_enabled_4bit,
-                                   &color_bin_size, &depth_bin_size);
-       } else {
-               color_bin_size = si_get_color_bin_size(sctx, cb_target_enabled_4bit);
-               depth_bin_size = si_get_depth_bin_size(sctx);
-       }
-
-       unsigned color_area = color_bin_size.x * color_bin_size.y;
-       unsigned depth_area = depth_bin_size.x * depth_bin_size.y;
-
-       struct uvec2 bin_size = color_area < depth_area ? color_bin_size
-                                                       : depth_bin_size;
-
-       if (!bin_size.x || !bin_size.y) {
-               si_emit_dpbb_disable(sctx);
-               return;
-       }
-
-       /* Enable DFSM if it's preferred. */
-       unsigned punchout_mode = V_028060_FORCE_OFF;
-       bool disable_start_of_prim = true;
-       bool zs_eqaa_dfsm_bug = sctx->chip_class == GFX9 &&
-                               sctx->framebuffer.state.zsbuf &&
-                               sctx->framebuffer.nr_samples !=
-                               MAX2(1, sctx->framebuffer.state.zsbuf->texture->nr_samples);
-
-       if (sscreen->dfsm_allowed &&
-           !zs_eqaa_dfsm_bug &&
-           cb_target_enabled_4bit &&
-           !G_02880C_KILL_ENABLE(db_shader_control) &&
-           /* These two also imply that DFSM is disabled when PS writes to memory. */
-           !G_02880C_EXEC_ON_HIER_FAIL(db_shader_control) &&
-           !G_02880C_EXEC_ON_NOOP(db_shader_control) &&
-           G_02880C_Z_ORDER(db_shader_control) == V_02880C_EARLY_Z_THEN_LATE_Z) {
-               punchout_mode = V_028060_AUTO;
-               disable_start_of_prim = (cb_target_enabled_4bit &
-                                        blend->blend_enable_4bit) != 0;
-       }
-
-       /* Tunable parameters. Also test with DFSM enabled/disabled. */
-       unsigned context_states_per_bin; /* allowed range: [1, 6] */
-       unsigned persistent_states_per_bin; /* allowed range: [1, 32] */
-       unsigned fpovs_per_batch; /* allowed range: [0, 255], 0 = unlimited */
-
-       /* Tuned for Raven. Vega might need different values. */
-       if (sscreen->info.has_dedicated_vram) {
-               if (sscreen->info.num_render_backends > 4) {
-                       context_states_per_bin = 1;
-                       persistent_states_per_bin = 1;
-               } else {
-                       context_states_per_bin = 3;
-                       persistent_states_per_bin = 8;
-               }
-       } else {
-               /* This is a workaround for:
-                *    https://bugs.freedesktop.org/show_bug.cgi?id=110214
-                * (an alternative is to insert manual BATCH_BREAK event when
-                * a context_roll is detected). */
-               context_states_per_bin = sctx->screen->info.has_gfx9_scissor_bug ? 1 : 6;
-               /* Using 32 here can cause GPU hangs on RAVEN1 */
-               persistent_states_per_bin = 16;
-       }
-       fpovs_per_batch = 63;
-
-       /* Emit registers. */
-       struct uvec2 bin_size_extend = {};
-       if (bin_size.x >= 32)
-               bin_size_extend.x = util_logbase2(bin_size.x) - 5;
-       if (bin_size.y >= 32)
-               bin_size_extend.y = util_logbase2(bin_size.y) - 5;
-
-       unsigned initial_cdw = sctx->gfx_cs->current.cdw;
-       radeon_opt_set_context_reg(
-               sctx, R_028C44_PA_SC_BINNER_CNTL_0,
-               SI_TRACKED_PA_SC_BINNER_CNTL_0,
-               S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) |
-               S_028C44_BIN_SIZE_X(bin_size.x == 16) |
-               S_028C44_BIN_SIZE_Y(bin_size.y == 16) |
-               S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) |
-               S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) |
-               S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin - 1) |
-               S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin - 1) |
-               S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) |
-               S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) |
-               S_028C44_OPTIMAL_BIN_SELECTION(1) |
-               S_028C44_FLUSH_ON_BINNING_TRANSITION((sctx->family == CHIP_VEGA12 ||
-                                                     sctx->family == CHIP_VEGA20 ||
-                                                     sctx->family >= CHIP_RAVEN2) &&
-                                                    sctx->last_binning_enabled != 1));
-
-       unsigned db_dfsm_control = sctx->chip_class >= GFX10 ? R_028038_DB_DFSM_CONTROL
-                                                            : R_028060_DB_DFSM_CONTROL;
-       radeon_opt_set_context_reg(sctx, db_dfsm_control,
-                                  SI_TRACKED_DB_DFSM_CONTROL,
-                                  S_028060_PUNCHOUT_MODE(punchout_mode) |
-                                  S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
-       if (initial_cdw != sctx->gfx_cs->current.cdw)
-               sctx->context_roll = true;
-
-       sctx->last_binning_enabled = true;
+   struct si_screen *sscreen = sctx->screen;
+   struct si_state_blend *blend = sctx->queued.named.blend;
+   struct si_state_dsa *dsa = sctx->queued.named.dsa;
+   unsigned db_shader_control = sctx->ps_db_shader_control;
+
+   assert(sctx->chip_class >= GFX9);
+
+   if (!sscreen->dpbb_allowed || sctx->dpbb_force_off) {
+      si_emit_dpbb_disable(sctx);
+      return;
+   }
+
+   bool ps_can_kill =
+      G_02880C_KILL_ENABLE(db_shader_control) || G_02880C_MASK_EXPORT_ENABLE(db_shader_control) ||
+      G_02880C_COVERAGE_TO_MASK_ENABLE(db_shader_control) || blend->alpha_to_coverage;
+
+   bool db_can_reject_z_trivially = !G_02880C_Z_EXPORT_ENABLE(db_shader_control) ||
+                                    G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control) ||
+                                    G_02880C_DEPTH_BEFORE_SHADER(db_shader_control);
+
+   /* Disable DPBB when it's believed to be inefficient. */
+   if (sscreen->info.num_render_backends > 4 && ps_can_kill && db_can_reject_z_trivially &&
+       sctx->framebuffer.state.zsbuf && dsa->db_can_write) {
+      si_emit_dpbb_disable(sctx);
+      return;
+   }
+
+   /* Compute the bin size. */
+   /* TODO: We could also look at enabled pixel shader outputs. */
+   unsigned cb_target_enabled_4bit =
+      sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_enabled_4bit;
+   struct uvec2 color_bin_size, depth_bin_size;
+
+   if (sctx->chip_class >= GFX10) {
+      gfx10_get_bin_sizes(sctx, cb_target_enabled_4bit, &color_bin_size, &depth_bin_size);
+   } else {
+      color_bin_size = si_get_color_bin_size(sctx, cb_target_enabled_4bit);
+      depth_bin_size = si_get_depth_bin_size(sctx);
+   }
+
+   unsigned color_area = color_bin_size.x * color_bin_size.y;
+   unsigned depth_area = depth_bin_size.x * depth_bin_size.y;
+
+   struct uvec2 bin_size = color_area < depth_area ? color_bin_size : depth_bin_size;
+
+   if (!bin_size.x || !bin_size.y) {
+      si_emit_dpbb_disable(sctx);
+      return;
+   }
+
+   /* Enable DFSM if it's preferred. */
+   unsigned punchout_mode = V_028060_FORCE_OFF;
+   bool disable_start_of_prim = true;
+   bool zs_eqaa_dfsm_bug =
+      sctx->chip_class == GFX9 && sctx->framebuffer.state.zsbuf &&
+      sctx->framebuffer.nr_samples != MAX2(1, sctx->framebuffer.state.zsbuf->texture->nr_samples);
+
+   if (sscreen->dfsm_allowed && !zs_eqaa_dfsm_bug && cb_target_enabled_4bit &&
+       !G_02880C_KILL_ENABLE(db_shader_control) &&
+       /* These two also imply that DFSM is disabled when PS writes to memory. */
+       !G_02880C_EXEC_ON_HIER_FAIL(db_shader_control) &&
+       !G_02880C_EXEC_ON_NOOP(db_shader_control) &&
+       G_02880C_Z_ORDER(db_shader_control) == V_02880C_EARLY_Z_THEN_LATE_Z) {
+      punchout_mode = V_028060_AUTO;
+      disable_start_of_prim = (cb_target_enabled_4bit & blend->blend_enable_4bit) != 0;
+   }
+
+   /* Tunable parameters. Also test with DFSM enabled/disabled. */
+   unsigned context_states_per_bin;    /* allowed range: [1, 6] */
+   unsigned persistent_states_per_bin; /* allowed range: [1, 32] */
+   unsigned fpovs_per_batch;           /* allowed range: [0, 255], 0 = unlimited */
+
+   /* Tuned for Raven. Vega might need different values. */
+   if (sscreen->info.has_dedicated_vram) {
+      if (sscreen->info.num_render_backends > 4) {
+         context_states_per_bin = 1;
+         persistent_states_per_bin = 1;
+      } else {
+         context_states_per_bin = 3;
+         persistent_states_per_bin = 8;
+      }
+   } else {
+      /* This is a workaround for:
+       *    https://bugs.freedesktop.org/show_bug.cgi?id=110214
+       * (an alternative is to insert manual BATCH_BREAK event when
+       * a context_roll is detected). */
+      context_states_per_bin = sctx->screen->info.has_gfx9_scissor_bug ? 1 : 6;
+      /* Using 32 here can cause GPU hangs on RAVEN1 */
+      persistent_states_per_bin = 16;
+   }
+   fpovs_per_batch = 63;
+
+   /* Emit registers. */
+   struct uvec2 bin_size_extend = {};
+   if (bin_size.x >= 32)
+      bin_size_extend.x = util_logbase2(bin_size.x) - 5;
+   if (bin_size.y >= 32)
+      bin_size_extend.y = util_logbase2(bin_size.y) - 5;
+
+   unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+   radeon_opt_set_context_reg(
+      sctx, R_028C44_PA_SC_BINNER_CNTL_0, SI_TRACKED_PA_SC_BINNER_CNTL_0,
+      S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) | S_028C44_BIN_SIZE_X(bin_size.x == 16) |
+         S_028C44_BIN_SIZE_Y(bin_size.y == 16) | S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) |
+         S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) |
+         S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin - 1) |
+         S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin - 1) |
+         S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) |
+         S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) | S_028C44_OPTIMAL_BIN_SELECTION(1) |
+         S_028C44_FLUSH_ON_BINNING_TRANSITION((sctx->family == CHIP_VEGA12 ||
+                                               sctx->family == CHIP_VEGA20 ||
+                                               sctx->family >= CHIP_RAVEN2) &&
+                                              sctx->last_binning_enabled != 1));
+
+   unsigned db_dfsm_control =
+      sctx->chip_class >= GFX10 ? R_028038_DB_DFSM_CONTROL : R_028060_DB_DFSM_CONTROL;
+   radeon_opt_set_context_reg(
+      sctx, db_dfsm_control, SI_TRACKED_DB_DFSM_CONTROL,
+      S_028060_PUNCHOUT_MODE(punchout_mode) | S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
+   if (initial_cdw != sctx->gfx_cs->current.cdw)
+      sctx->context_roll = true;
+
+   sctx->last_binning_enabled = true;
 }
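
To summarize the selection above with illustrative numbers: if the color path yields a 128x128 bin (16384 pixels) and the depth path a 64x128 bin (8192 pixels), the smaller-area depth bin wins and 64x128 is programmed. The per-bin state limits are written biased by one, so e.g. persistent_states_per_bin = 16 is emitted as 15 in PA_SC_BINNER_CNTL_0.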
index dc6de604d21ee4078499a4f829f44c0869255d8b..7def05440e1d8dc791decbc9b757df8d561060ed 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
+#include "ac_debug.h"
 #include "si_build_pm4.h"
 #include "sid.h"
-
 #include "util/u_index_modify.h"
 #include "util/u_log.h"
-#include "util/u_upload_mgr.h"
 #include "util/u_prim.h"
 #include "util/u_suballoc.h"
-
-#include "ac_debug.h"
+#include "util/u_upload_mgr.h"
 
 /* special primitive types */
-#define SI_PRIM_RECTANGLE_LIST PIPE_PRIM_MAX
+#define SI_PRIM_RECTANGLE_LIST PIPE_PRIM_MAX
 
 static unsigned si_conv_pipe_prim(unsigned mode)
 {
-        static const unsigned prim_conv[] = {
-               [PIPE_PRIM_POINTS]                      = V_008958_DI_PT_POINTLIST,
-               [PIPE_PRIM_LINES]                       = V_008958_DI_PT_LINELIST,
-               [PIPE_PRIM_LINE_LOOP]                   = V_008958_DI_PT_LINELOOP,
-               [PIPE_PRIM_LINE_STRIP]                  = V_008958_DI_PT_LINESTRIP,
-               [PIPE_PRIM_TRIANGLES]                   = V_008958_DI_PT_TRILIST,
-               [PIPE_PRIM_TRIANGLE_STRIP]              = V_008958_DI_PT_TRISTRIP,
-               [PIPE_PRIM_TRIANGLE_FAN]                = V_008958_DI_PT_TRIFAN,
-               [PIPE_PRIM_QUADS]                       = V_008958_DI_PT_QUADLIST,
-               [PIPE_PRIM_QUAD_STRIP]                  = V_008958_DI_PT_QUADSTRIP,
-               [PIPE_PRIM_POLYGON]                     = V_008958_DI_PT_POLYGON,
-               [PIPE_PRIM_LINES_ADJACENCY]             = V_008958_DI_PT_LINELIST_ADJ,
-               [PIPE_PRIM_LINE_STRIP_ADJACENCY]        = V_008958_DI_PT_LINESTRIP_ADJ,
-               [PIPE_PRIM_TRIANGLES_ADJACENCY]         = V_008958_DI_PT_TRILIST_ADJ,
-               [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY]    = V_008958_DI_PT_TRISTRIP_ADJ,
-               [PIPE_PRIM_PATCHES]                     = V_008958_DI_PT_PATCH,
-               [SI_PRIM_RECTANGLE_LIST]                = V_008958_DI_PT_RECTLIST
-        };
-       assert(mode < ARRAY_SIZE(prim_conv));
-       return prim_conv[mode];
+   static const unsigned prim_conv[] = {
+      [PIPE_PRIM_POINTS] = V_008958_DI_PT_POINTLIST,
+      [PIPE_PRIM_LINES] = V_008958_DI_PT_LINELIST,
+      [PIPE_PRIM_LINE_LOOP] = V_008958_DI_PT_LINELOOP,
+      [PIPE_PRIM_LINE_STRIP] = V_008958_DI_PT_LINESTRIP,
+      [PIPE_PRIM_TRIANGLES] = V_008958_DI_PT_TRILIST,
+      [PIPE_PRIM_TRIANGLE_STRIP] = V_008958_DI_PT_TRISTRIP,
+      [PIPE_PRIM_TRIANGLE_FAN] = V_008958_DI_PT_TRIFAN,
+      [PIPE_PRIM_QUADS] = V_008958_DI_PT_QUADLIST,
+      [PIPE_PRIM_QUAD_STRIP] = V_008958_DI_PT_QUADSTRIP,
+      [PIPE_PRIM_POLYGON] = V_008958_DI_PT_POLYGON,
+      [PIPE_PRIM_LINES_ADJACENCY] = V_008958_DI_PT_LINELIST_ADJ,
+      [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_008958_DI_PT_LINESTRIP_ADJ,
+      [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_008958_DI_PT_TRILIST_ADJ,
+      [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_008958_DI_PT_TRISTRIP_ADJ,
+      [PIPE_PRIM_PATCHES] = V_008958_DI_PT_PATCH,
+      [SI_PRIM_RECTANGLE_LIST] = V_008958_DI_PT_RECTLIST};
+   assert(mode < ARRAY_SIZE(prim_conv));
+   return prim_conv[mode];
 }
 
 /**
@@ -67,652 +64,597 @@ static unsigned si_conv_pipe_prim(unsigned mode)
  * The information about LDS and other non-compile-time parameters is then
  * written to userdata SGPRs.
  */
-static void si_emit_derived_tess_state(struct si_context *sctx,
-                                      const struct pipe_draw_info *info,
-                                      unsigned *num_patches)
+static void si_emit_derived_tess_state(struct si_context *sctx, const struct pipe_draw_info *info,
+                                       unsigned *num_patches)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       struct si_shader *ls_current;
-       struct si_shader_selector *ls;
-       /* The TES pointer will only be used for sctx->last_tcs.
-        * It would be wrong to think that TCS = TES. */
-       struct si_shader_selector *tcs =
-               sctx->tcs_shader.cso ? sctx->tcs_shader.cso : sctx->tes_shader.cso;
-       unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id;
-       bool has_primid_instancing_bug = sctx->chip_class == GFX6 &&
-                                        sctx->screen->info.max_se == 1;
-       unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL];
-       unsigned num_tcs_input_cp = info->vertices_per_patch;
-       unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs;
-       unsigned num_tcs_patch_outputs;
-       unsigned input_vertex_size, output_vertex_size, pervertex_output_patch_size;
-       unsigned input_patch_size, output_patch_size, output_patch0_offset;
-       unsigned perpatch_output_offset, lds_size;
-       unsigned tcs_in_layout, tcs_out_layout, tcs_out_offsets;
-       unsigned offchip_layout, hardware_lds_size, ls_hs_config;
-
-       /* Since GFX9 has merged LS-HS in the TCS state, set LS = TCS. */
-       if (sctx->chip_class >= GFX9) {
-               if (sctx->tcs_shader.cso)
-                       ls_current = sctx->tcs_shader.current;
-               else
-                       ls_current = sctx->fixed_func_tcs_shader.current;
-
-               ls = ls_current->key.part.tcs.ls;
-       } else {
-               ls_current = sctx->vs_shader.current;
-               ls = sctx->vs_shader.cso;
-       }
-
-       if (sctx->last_ls == ls_current &&
-           sctx->last_tcs == tcs &&
-           sctx->last_tes_sh_base == tes_sh_base &&
-           sctx->last_num_tcs_input_cp == num_tcs_input_cp &&
-           (!has_primid_instancing_bug ||
-            (sctx->last_tess_uses_primid == tess_uses_primid))) {
-               *num_patches = sctx->last_num_patches;
-               return;
-       }
-
-       sctx->last_ls = ls_current;
-       sctx->last_tcs = tcs;
-       sctx->last_tes_sh_base = tes_sh_base;
-       sctx->last_num_tcs_input_cp = num_tcs_input_cp;
-       sctx->last_tess_uses_primid = tess_uses_primid;
-
-       /* This calculates how shader inputs and outputs among VS, TCS, and TES
-        * are laid out in LDS. */
-       num_tcs_inputs = util_last_bit64(ls->outputs_written);
-
-       if (sctx->tcs_shader.cso) {
-               num_tcs_outputs = util_last_bit64(tcs->outputs_written);
-               num_tcs_output_cp = tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
-               num_tcs_patch_outputs = util_last_bit64(tcs->patch_outputs_written);
-       } else {
-               /* No TCS. Route varyings from LS to TES. */
-               num_tcs_outputs = num_tcs_inputs;
-               num_tcs_output_cp = num_tcs_input_cp;
-               num_tcs_patch_outputs = 2; /* TESSINNER + TESSOUTER */
-       }
-
-       input_vertex_size = ls->lshs_vertex_stride;
-       output_vertex_size = num_tcs_outputs * 16;
-
-       input_patch_size = num_tcs_input_cp * input_vertex_size;
-
-       pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
-       output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
-
-       /* Ensure that we only need one wave per SIMD so we don't need to check
-        * resource usage. Also ensures that the number of tcs in and out
-        * vertices per threadgroup are at most 256.
-        */
-       unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp);
-       *num_patches = 256 / max_verts_per_patch;
-
-       /* Make sure that the data fits in LDS. This assumes the shaders only
-        * use LDS for the inputs and outputs.
-        *
-        * While GFX7 can use 64K per threadgroup, there is a hang on Stoney
-        * with 2 CUs if we use more than 32K. The closed Vulkan driver also
-        * uses 32K at most on all GCN chips.
-        */
-       hardware_lds_size = 32768;
-       *num_patches = MIN2(*num_patches, hardware_lds_size / (input_patch_size +
-                                                              output_patch_size));
-
-       /* Make sure the output data fits in the offchip buffer */
-       *num_patches = MIN2(*num_patches,
-                           (sctx->screen->tess_offchip_block_dw_size * 4) /
-                           output_patch_size);
-
-       /* Not necessary for correctness, but improves performance.
-        * The hardware can do more, but the radeonsi shader constant is
-        * limited to 6 bits.
-        */
-       *num_patches = MIN2(*num_patches, 63); /* triangles: 3 full waves except 3 lanes */
-
-       /* When distributed tessellation is unsupported, switch between SEs
-        * at a higher frequency to compensate for it.
-        */
-       if (!sctx->screen->info.has_distributed_tess && sctx->screen->info.max_se > 1)
-               *num_patches = MIN2(*num_patches, 16); /* recommended */
-
-       /* Make sure that vector lanes are reasonably occupied. It probably
-        * doesn't matter much because this is LS-HS, and TES is likely to
-        * occupy significantly more CUs.
-        */
-       unsigned temp_verts_per_tg = *num_patches * max_verts_per_patch;
-       unsigned wave_size = sctx->screen->ge_wave_size;
-
-       if (temp_verts_per_tg > wave_size && temp_verts_per_tg % wave_size < wave_size*3/4)
-               *num_patches = (temp_verts_per_tg & ~(wave_size - 1)) / max_verts_per_patch;
-
-       if (sctx->chip_class == GFX6) {
-               /* GFX6 bug workaround, related to power management. Limit LS-HS
-                * threadgroups to only one wave.
-                */
-               unsigned one_wave = wave_size / max_verts_per_patch;
-               *num_patches = MIN2(*num_patches, one_wave);
-       }
-
-       /* The VGT HS block increments the patch ID unconditionally
-        * within a single threadgroup. This results in incorrect
-        * patch IDs when instanced draws are used.
-        *
-        * The intended solution is to restrict threadgroups to
-        * a single instance by setting SWITCH_ON_EOI, which
-        * should cause IA to split instances up. However, this
-        * doesn't work correctly on GFX6 when there is no other
-        * SE to switch to.
-        */
-       if (has_primid_instancing_bug && tess_uses_primid)
-               *num_patches = 1;
-
-       sctx->last_num_patches = *num_patches;
-
-       output_patch0_offset = input_patch_size * *num_patches;
-       perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
-
-       /* Compute userdata SGPRs. */
-       assert(((input_vertex_size / 4) & ~0xff) == 0);
-       assert(((output_vertex_size / 4) & ~0xff) == 0);
-       assert(((input_patch_size / 4) & ~0x1fff) == 0);
-       assert(((output_patch_size / 4) & ~0x1fff) == 0);
-       assert(((output_patch0_offset / 16) & ~0xffff) == 0);
-       assert(((perpatch_output_offset / 16) & ~0xffff) == 0);
-       assert(num_tcs_input_cp <= 32);
-       assert(num_tcs_output_cp <= 32);
-
-       uint64_t ring_va = si_resource(sctx->tess_rings)->gpu_address;
-       assert((ring_va & u_bit_consecutive(0, 19)) == 0);
-
-       tcs_in_layout = S_VS_STATE_LS_OUT_PATCH_SIZE(input_patch_size / 4) |
-                       S_VS_STATE_LS_OUT_VERTEX_SIZE(input_vertex_size / 4);
-       tcs_out_layout = (output_patch_size / 4) |
-                        (num_tcs_input_cp << 13) |
-                        ring_va;
-       tcs_out_offsets = (output_patch0_offset / 16) |
-                         ((perpatch_output_offset / 16) << 16);
-       offchip_layout = *num_patches |
-                        (num_tcs_output_cp << 6) |
-                        (pervertex_output_patch_size * *num_patches << 12);
-
-       /* Compute the LDS size. */
-       lds_size = output_patch0_offset + output_patch_size * *num_patches;
-
-       if (sctx->chip_class >= GFX7) {
-               assert(lds_size <= 65536);
-               lds_size = align(lds_size, 512) / 512;
-       } else {
-               assert(lds_size <= 32768);
-               lds_size = align(lds_size, 256) / 256;
-       }
-
-       /* Set SI_SGPR_VS_STATE_BITS. */
-       sctx->current_vs_state &= C_VS_STATE_LS_OUT_PATCH_SIZE &
-                                 C_VS_STATE_LS_OUT_VERTEX_SIZE;
-       sctx->current_vs_state |= tcs_in_layout;
-
-       /* We should be able to support in-shader LDS use with LLVM >= 9
-        * by just adding the lds_sizes together, but it has never
-        * been tested. */
-       assert(ls_current->config.lds_size == 0);
-
-       if (sctx->chip_class >= GFX9) {
-               unsigned hs_rsrc2 = ls_current->config.rsrc2;
-
-               if (sctx->chip_class >= GFX10)
-                       hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX10(lds_size);
-               else
-                       hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(lds_size);
-
-               radeon_set_sh_reg(cs, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, hs_rsrc2);
-
-               /* Set userdata SGPRs for merged LS-HS. */
-               radeon_set_sh_reg_seq(cs,
-                                     R_00B430_SPI_SHADER_USER_DATA_LS_0 +
-                                     GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, 3);
-               radeon_emit(cs, offchip_layout);
-               radeon_emit(cs, tcs_out_offsets);
-               radeon_emit(cs, tcs_out_layout);
-       } else {
-               unsigned ls_rsrc2 = ls_current->config.rsrc2;
-
-               si_multiwave_lds_size_workaround(sctx->screen, &lds_size);
-               ls_rsrc2 |= S_00B52C_LDS_SIZE(lds_size);
-
-               /* Due to a hw bug, RSRC2_LS must be written twice with another
-                * LS register written in between. */
-               if (sctx->chip_class == GFX7 && sctx->family != CHIP_HAWAII)
-                       radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
-               radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
-               radeon_emit(cs, ls_current->config.rsrc1);
-               radeon_emit(cs, ls_rsrc2);
-
-               /* Set userdata SGPRs for TCS. */
-               radeon_set_sh_reg_seq(cs,
-                       R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4, 4);
-               radeon_emit(cs, offchip_layout);
-               radeon_emit(cs, tcs_out_offsets);
-               radeon_emit(cs, tcs_out_layout);
-               radeon_emit(cs, tcs_in_layout);
-       }
-
-       /* Set userdata SGPRs for TES. */
-       radeon_set_sh_reg_seq(cs, tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, 2);
-       radeon_emit(cs, offchip_layout);
-       radeon_emit(cs, ring_va);
-
-       ls_hs_config = S_028B58_NUM_PATCHES(*num_patches) |
-                      S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |
-                      S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp);
-
-       if (sctx->last_ls_hs_config != ls_hs_config) {
-               if (sctx->chip_class >= GFX7) {
-                       radeon_set_context_reg_idx(cs, R_028B58_VGT_LS_HS_CONFIG, 2,
-                                                  ls_hs_config);
-               } else {
-                       radeon_set_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG,
-                                              ls_hs_config);
-               }
-               sctx->last_ls_hs_config = ls_hs_config;
-               sctx->context_roll = true;
-       }
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   struct si_shader *ls_current;
+   struct si_shader_selector *ls;
+   /* The TES pointer will only be used for sctx->last_tcs.
+    * It would be wrong to think that TCS = TES. */
+   struct si_shader_selector *tcs =
+      sctx->tcs_shader.cso ? sctx->tcs_shader.cso : sctx->tes_shader.cso;
+   unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id;
+   bool has_primid_instancing_bug = sctx->chip_class == GFX6 && sctx->screen->info.max_se == 1;
+   unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL];
+   unsigned num_tcs_input_cp = info->vertices_per_patch;
+   unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs;
+   unsigned num_tcs_patch_outputs;
+   unsigned input_vertex_size, output_vertex_size, pervertex_output_patch_size;
+   unsigned input_patch_size, output_patch_size, output_patch0_offset;
+   unsigned perpatch_output_offset, lds_size;
+   unsigned tcs_in_layout, tcs_out_layout, tcs_out_offsets;
+   unsigned offchip_layout, hardware_lds_size, ls_hs_config;
+
+   /* Since GFX9 has merged LS-HS in the TCS state, set LS = TCS. */
+   if (sctx->chip_class >= GFX9) {
+      if (sctx->tcs_shader.cso)
+         ls_current = sctx->tcs_shader.current;
+      else
+         ls_current = sctx->fixed_func_tcs_shader.current;
+
+      ls = ls_current->key.part.tcs.ls;
+   } else {
+      ls_current = sctx->vs_shader.current;
+      ls = sctx->vs_shader.cso;
+   }
+
+   if (sctx->last_ls == ls_current && sctx->last_tcs == tcs &&
+       sctx->last_tes_sh_base == tes_sh_base && sctx->last_num_tcs_input_cp == num_tcs_input_cp &&
+       (!has_primid_instancing_bug || (sctx->last_tess_uses_primid == tess_uses_primid))) {
+      *num_patches = sctx->last_num_patches;
+      return;
+   }
+
+   sctx->last_ls = ls_current;
+   sctx->last_tcs = tcs;
+   sctx->last_tes_sh_base = tes_sh_base;
+   sctx->last_num_tcs_input_cp = num_tcs_input_cp;
+   sctx->last_tess_uses_primid = tess_uses_primid;
+
+   /* This calculates how shader inputs and outputs among VS, TCS, and TES
+    * are laid out in LDS. */
+   num_tcs_inputs = util_last_bit64(ls->outputs_written);
+
+   if (sctx->tcs_shader.cso) {
+      num_tcs_outputs = util_last_bit64(tcs->outputs_written);
+      num_tcs_output_cp = tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
+      num_tcs_patch_outputs = util_last_bit64(tcs->patch_outputs_written);
+   } else {
+      /* No TCS. Route varyings from LS to TES. */
+      num_tcs_outputs = num_tcs_inputs;
+      num_tcs_output_cp = num_tcs_input_cp;
+      num_tcs_patch_outputs = 2; /* TESSINNER + TESSOUTER */
+   }
+
+   input_vertex_size = ls->lshs_vertex_stride;
+   output_vertex_size = num_tcs_outputs * 16;
+
+   input_patch_size = num_tcs_input_cp * input_vertex_size;
+
+   pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
+   output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
+
+   /* Ensure that we only need one wave per SIMD so we don't need to check
+    * resource usage. Also ensures that the number of tcs in and out
+    * vertices per threadgroup are at most 256.
+    */
+   unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp);
+   *num_patches = 256 / max_verts_per_patch;
+
+   /* Make sure that the data fits in LDS. This assumes the shaders only
+    * use LDS for the inputs and outputs.
+    *
+    * While GFX7 can use 64K per threadgroup, there is a hang on Stoney
+    * with 2 CUs if we use more than 32K. The closed Vulkan driver also
+    * uses 32K at most on all GCN chips.
+    */
+   hardware_lds_size = 32768;
+   *num_patches = MIN2(*num_patches, hardware_lds_size / (input_patch_size + output_patch_size));
+
+   /* Make sure the output data fits in the offchip buffer */
+   *num_patches =
+      MIN2(*num_patches, (sctx->screen->tess_offchip_block_dw_size * 4) / output_patch_size);
+
+   /* Not necessary for correctness, but improves performance.
+    * The hardware can do more, but the radeonsi shader constant is
+    * limited to 6 bits.
+    */
+   *num_patches = MIN2(*num_patches, 63); /* triangles: 3 full waves except 3 lanes */
+
+   /* When distributed tessellation is unsupported, switch between SEs
+    * at a higher frequency to compensate for it.
+    */
+   if (!sctx->screen->info.has_distributed_tess && sctx->screen->info.max_se > 1)
+      *num_patches = MIN2(*num_patches, 16); /* recommended */
+
+   /* Make sure that vector lanes are reasonably occupied. It probably
+    * doesn't matter much because this is LS-HS, and TES is likely to
+    * occupy significantly more CUs.
+    */
+   unsigned temp_verts_per_tg = *num_patches * max_verts_per_patch;
+   unsigned wave_size = sctx->screen->ge_wave_size;
+
+   if (temp_verts_per_tg > wave_size && temp_verts_per_tg % wave_size < wave_size * 3 / 4)
+      *num_patches = (temp_verts_per_tg & ~(wave_size - 1)) / max_verts_per_patch;
+
+   if (sctx->chip_class == GFX6) {
+      /* GFX6 bug workaround, related to power management. Limit LS-HS
+       * threadgroups to only one wave.
+       */
+      unsigned one_wave = wave_size / max_verts_per_patch;
+      *num_patches = MIN2(*num_patches, one_wave);
+   }
+
+   /* The VGT HS block increments the patch ID unconditionally
+    * within a single threadgroup. This results in incorrect
+    * patch IDs when instanced draws are used.
+    *
+    * The intended solution is to restrict threadgroups to
+    * a single instance by setting SWITCH_ON_EOI, which
+    * should cause IA to split instances up. However, this
+    * doesn't work correctly on GFX6 when there is no other
+    * SE to switch to.
+    */
+   if (has_primid_instancing_bug && tess_uses_primid)
+      *num_patches = 1;
+
+   sctx->last_num_patches = *num_patches;
+
+   output_patch0_offset = input_patch_size * *num_patches;
+   perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
+
+   /* Compute userdata SGPRs. */
+   assert(((input_vertex_size / 4) & ~0xff) == 0);
+   assert(((output_vertex_size / 4) & ~0xff) == 0);
+   assert(((input_patch_size / 4) & ~0x1fff) == 0);
+   assert(((output_patch_size / 4) & ~0x1fff) == 0);
+   assert(((output_patch0_offset / 16) & ~0xffff) == 0);
+   assert(((perpatch_output_offset / 16) & ~0xffff) == 0);
+   assert(num_tcs_input_cp <= 32);
+   assert(num_tcs_output_cp <= 32);
+
+   uint64_t ring_va = si_resource(sctx->tess_rings)->gpu_address;
+   assert((ring_va & u_bit_consecutive(0, 19)) == 0);
+
+   tcs_in_layout = S_VS_STATE_LS_OUT_PATCH_SIZE(input_patch_size / 4) |
+                   S_VS_STATE_LS_OUT_VERTEX_SIZE(input_vertex_size / 4);
+   tcs_out_layout = (output_patch_size / 4) | (num_tcs_input_cp << 13) | ring_va;
+   tcs_out_offsets = (output_patch0_offset / 16) | ((perpatch_output_offset / 16) << 16);
+   offchip_layout =
+      *num_patches | (num_tcs_output_cp << 6) | (pervertex_output_patch_size * *num_patches << 12);
+
+   /* Compute the LDS size. */
+   lds_size = output_patch0_offset + output_patch_size * *num_patches;
+
+   if (sctx->chip_class >= GFX7) {
+      assert(lds_size <= 65536);
+      lds_size = align(lds_size, 512) / 512;
+   } else {
+      assert(lds_size <= 32768);
+      lds_size = align(lds_size, 256) / 256;
+   }
+
+   /* Set SI_SGPR_VS_STATE_BITS. */
+   sctx->current_vs_state &= C_VS_STATE_LS_OUT_PATCH_SIZE & C_VS_STATE_LS_OUT_VERTEX_SIZE;
+   sctx->current_vs_state |= tcs_in_layout;
+
+   /* We should be able to support in-shader LDS use with LLVM >= 9
+    * by just adding the lds_sizes together, but it has never
+    * been tested. */
+   assert(ls_current->config.lds_size == 0);
+
+   if (sctx->chip_class >= GFX9) {
+      unsigned hs_rsrc2 = ls_current->config.rsrc2;
+
+      if (sctx->chip_class >= GFX10)
+         hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX10(lds_size);
+      else
+         hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(lds_size);
+
+      radeon_set_sh_reg(cs, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, hs_rsrc2);
+
+      /* Set userdata SGPRs for merged LS-HS. */
+      radeon_set_sh_reg_seq(
+         cs, R_00B430_SPI_SHADER_USER_DATA_LS_0 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, 3);
+      radeon_emit(cs, offchip_layout);
+      radeon_emit(cs, tcs_out_offsets);
+      radeon_emit(cs, tcs_out_layout);
+   } else {
+      unsigned ls_rsrc2 = ls_current->config.rsrc2;
+
+      si_multiwave_lds_size_workaround(sctx->screen, &lds_size);
+      ls_rsrc2 |= S_00B52C_LDS_SIZE(lds_size);
+
+      /* Due to a hw bug, RSRC2_LS must be written twice with another
+       * LS register written in between. */
+      if (sctx->chip_class == GFX7 && sctx->family != CHIP_HAWAII)
+         radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
+      radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
+      radeon_emit(cs, ls_current->config.rsrc1);
+      radeon_emit(cs, ls_rsrc2);
+
+      /* Set userdata SGPRs for TCS. */
+      radeon_set_sh_reg_seq(
+         cs, R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4, 4);
+      radeon_emit(cs, offchip_layout);
+      radeon_emit(cs, tcs_out_offsets);
+      radeon_emit(cs, tcs_out_layout);
+      radeon_emit(cs, tcs_in_layout);
+   }
+
+   /* Set userdata SGPRs for TES. */
+   radeon_set_sh_reg_seq(cs, tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, 2);
+   radeon_emit(cs, offchip_layout);
+   radeon_emit(cs, ring_va);
+
+   ls_hs_config = S_028B58_NUM_PATCHES(*num_patches) | S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |
+                  S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp);
+
+   if (sctx->last_ls_hs_config != ls_hs_config) {
+      if (sctx->chip_class >= GFX7) {
+         radeon_set_context_reg_idx(cs, R_028B58_VGT_LS_HS_CONFIG, 2, ls_hs_config);
+      } else {
+         radeon_set_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config);
+      }
+      sctx->last_ls_hs_config = ls_hs_config;
+      sctx->context_roll = true;
+   }
 }
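For reference, the patch-count clamping above is easy to follow with a tiny standalone sketch. The patch sizes below are hypothetical (a triangle-domain patch with 8 vec4 inputs and outputs per vertex), and the 8K-dword off-chip block size is an assumption; only the clamping arithmetic and the 32768/63 limits mirror the function and its comments:

#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define MAX2(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
   /* Hypothetical triangle-domain patch: 3 input CPs, 3 output CPs,
    * 8 vec4 inputs and outputs per vertex, 2 per-patch outputs. */
   unsigned num_tcs_input_cp = 3, num_tcs_output_cp = 3;
   unsigned input_vertex_size = 8 * 16, output_vertex_size = 8 * 16;
   unsigned input_patch_size = num_tcs_input_cp * input_vertex_size;                /* 384 B */
   unsigned pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;   /* 384 B */
   unsigned output_patch_size = pervertex_output_patch_size + 2 * 16;               /* 416 B */

   /* At most 256 TCS input/output vertices per threadgroup. */
   unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp);
   unsigned num_patches = 256 / max_verts_per_patch;                                /* 85 */

   /* 32K of LDS shared by LS inputs and HS outputs. */
   num_patches = MIN2(num_patches, 32768 / (input_patch_size + output_patch_size)); /* 40 */

   /* Off-chip ring budget (8192 dwords per block is an assumed value). */
   num_patches = MIN2(num_patches, (8192 * 4) / output_patch_size);                 /* stays 40 */

   /* The radeonsi shader constant only has 6 bits for the patch count. */
   num_patches = MIN2(num_patches, 63);

   printf("num_patches = %u\n", num_patches); /* prints 40 */
   return 0;
}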
 
 static unsigned si_num_prims_for_vertices(const struct pipe_draw_info *info,
-                                         enum pipe_prim_type prim)
+                                          enum pipe_prim_type prim)
 {
-       switch (prim) {
-       case PIPE_PRIM_PATCHES:
-               return info->count / info->vertices_per_patch;
-       case PIPE_PRIM_POLYGON:
-               return info->count >= 3;
-       case SI_PRIM_RECTANGLE_LIST:
-               return info->count / 3;
-       default:
-               return u_decomposed_prims_for_vertices(prim, info->count);
-       }
+   switch (prim) {
+   case PIPE_PRIM_PATCHES:
+      return info->count / info->vertices_per_patch;
+   case PIPE_PRIM_POLYGON:
+      return info->count >= 3;
+   case SI_PRIM_RECTANGLE_LIST:
+      return info->count / 3;
+   default:
+      return u_decomposed_prims_for_vertices(prim, info->count);
+   }
 }
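As a quick worked example of the conversions above: 300 vertices drawn as PIPE_PRIM_TRIANGLES decompose into 100 primitives via u_decomposed_prims_for_vertices, the same 300 vertices as PIPE_PRIM_TRIANGLE_STRIP give 298, PIPE_PRIM_PATCHES with vertices_per_patch = 3 gives 100 patches, and SI_PRIM_RECTANGLE_LIST gives 100 rectangles (one per 3 vertices).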
 
-static unsigned
-si_get_init_multi_vgt_param(struct si_screen *sscreen,
-                           union si_vgt_param_key *key)
+static unsigned si_get_init_multi_vgt_param(struct si_screen *sscreen, union si_vgt_param_key *key)
 {
-       STATIC_ASSERT(sizeof(union si_vgt_param_key) == 4);
-       unsigned max_primgroup_in_wave = 2;
-
-       /* SWITCH_ON_EOP(0) is always preferable. */
-       bool wd_switch_on_eop = false;
-       bool ia_switch_on_eop = false;
-       bool ia_switch_on_eoi = false;
-       bool partial_vs_wave = false;
-       bool partial_es_wave = false;
-
-       if (key->u.uses_tess) {
-               /* SWITCH_ON_EOI must be set if PrimID is used. */
-               if (key->u.tess_uses_prim_id)
-                       ia_switch_on_eoi = true;
-
-               /* Bug with tessellation and GS on Bonaire and older 2 SE chips. */
-               if ((sscreen->info.family == CHIP_TAHITI ||
-                    sscreen->info.family == CHIP_PITCAIRN ||
-                    sscreen->info.family == CHIP_BONAIRE) &&
-                   key->u.uses_gs)
-                       partial_vs_wave = true;
-
-               /* Needed for 028B6C_DISTRIBUTION_MODE != 0. (implies >= GFX8) */
-               if (sscreen->info.has_distributed_tess) {
-                       if (key->u.uses_gs) {
-                               if (sscreen->info.chip_class == GFX8)
-                                       partial_es_wave = true;
-                       } else {
-                               partial_vs_wave = true;
-                       }
-               }
-       }
-
-       /* This is a hardware requirement. */
-       if (key->u.line_stipple_enabled ||
-           (sscreen->debug_flags & DBG(SWITCH_ON_EOP))) {
-               ia_switch_on_eop = true;
-               wd_switch_on_eop = true;
-       }
-
-       if (sscreen->info.chip_class >= GFX7) {
-               /* WD_SWITCH_ON_EOP has no effect on GPUs with less than
-                * 4 shader engines. Set 1 to pass the assertion below.
-                * The other cases are hardware requirements.
-                *
-                * Polaris supports primitive restart with WD_SWITCH_ON_EOP=0
-                * for points, line strips, and tri strips.
-                */
-               if (sscreen->info.max_se <= 2 ||
-                   key->u.prim == PIPE_PRIM_POLYGON ||
-                   key->u.prim == PIPE_PRIM_LINE_LOOP ||
-                   key->u.prim == PIPE_PRIM_TRIANGLE_FAN ||
-                   key->u.prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY ||
-                   (key->u.primitive_restart &&
-                    (sscreen->info.family < CHIP_POLARIS10 ||
-                     (key->u.prim != PIPE_PRIM_POINTS &&
-                      key->u.prim != PIPE_PRIM_LINE_STRIP &&
-                      key->u.prim != PIPE_PRIM_TRIANGLE_STRIP))) ||
-                   key->u.count_from_stream_output)
-                       wd_switch_on_eop = true;
-
-               /* Hawaii hangs if instancing is enabled and WD_SWITCH_ON_EOP is 0.
-                * We don't know that for indirect drawing, so treat it as
-                * always problematic. */
-               if (sscreen->info.family == CHIP_HAWAII &&
-                   key->u.uses_instancing)
-                       wd_switch_on_eop = true;
-
-               /* Performance recommendation for 4 SE Gfx7-8 parts if
-                * instances are smaller than a primgroup.
-                * Assume indirect draws always use small instances.
-                * This is needed for good VS wave utilization.
-                */
-               if (sscreen->info.chip_class <= GFX8 &&
-                   sscreen->info.max_se == 4 &&
-                   key->u.multi_instances_smaller_than_primgroup)
-                       wd_switch_on_eop = true;
-
-               /* Required on GFX7 and later. */
-               if (sscreen->info.max_se == 4 && !wd_switch_on_eop)
-                       ia_switch_on_eoi = true;
-
-               /* HW engineers suggested that PARTIAL_VS_WAVE_ON should be set
-                * to work around a GS hang.
-                */
-               if (key->u.uses_gs &&
-                   (sscreen->info.family == CHIP_TONGA ||
-                    sscreen->info.family == CHIP_FIJI ||
-                    sscreen->info.family == CHIP_POLARIS10 ||
-                    sscreen->info.family == CHIP_POLARIS11 ||
-                    sscreen->info.family == CHIP_POLARIS12 ||
-                    sscreen->info.family == CHIP_VEGAM))
-                       partial_vs_wave = true;
-
-               /* Required by Hawaii and, for some special cases, by GFX8. */
-               if (ia_switch_on_eoi &&
-                   (sscreen->info.family == CHIP_HAWAII ||
-                    (sscreen->info.chip_class == GFX8 &&
-                     (key->u.uses_gs || max_primgroup_in_wave != 2))))
-                       partial_vs_wave = true;
-
-               /* Instancing bug on Bonaire. */
-               if (sscreen->info.family == CHIP_BONAIRE && ia_switch_on_eoi &&
-                   key->u.uses_instancing)
-                       partial_vs_wave = true;
-
-               /* This only applies to Polaris10 and later 4 SE chips.
-                * wd_switch_on_eop is already true on all other chips.
-                */
-               if (!wd_switch_on_eop && key->u.primitive_restart)
-                       partial_vs_wave = true;
-
-               /* If the WD switch is false, the IA switch must be false too. */
-               assert(wd_switch_on_eop || !ia_switch_on_eop);
-       }
-
-       /* If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */
-       if (sscreen->info.chip_class <= GFX8 && ia_switch_on_eoi)
-               partial_es_wave = true;
-
-       return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) |
-               S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) |
-               S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) |
-               S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) |
-               S_028AA8_WD_SWITCH_ON_EOP(sscreen->info.chip_class >= GFX7 ? wd_switch_on_eop : 0) |
-               /* The following field was moved to VGT_SHADER_STAGES_EN in GFX9. */
-               S_028AA8_MAX_PRIMGRP_IN_WAVE(sscreen->info.chip_class == GFX8 ?
-                                            max_primgroup_in_wave : 0) |
-               S_030960_EN_INST_OPT_BASIC(sscreen->info.chip_class >= GFX9) |
-               S_030960_EN_INST_OPT_ADV(sscreen->info.chip_class >= GFX9);
+   STATIC_ASSERT(sizeof(union si_vgt_param_key) == 4);
+   unsigned max_primgroup_in_wave = 2;
+
+   /* SWITCH_ON_EOP(0) is always preferable. */
+   bool wd_switch_on_eop = false;
+   bool ia_switch_on_eop = false;
+   bool ia_switch_on_eoi = false;
+   bool partial_vs_wave = false;
+   bool partial_es_wave = false;
+
+   if (key->u.uses_tess) {
+      /* SWITCH_ON_EOI must be set if PrimID is used. */
+      if (key->u.tess_uses_prim_id)
+         ia_switch_on_eoi = true;
+
+      /* Bug with tessellation and GS on Bonaire and older 2 SE chips. */
+      if ((sscreen->info.family == CHIP_TAHITI || sscreen->info.family == CHIP_PITCAIRN ||
+           sscreen->info.family == CHIP_BONAIRE) &&
+          key->u.uses_gs)
+         partial_vs_wave = true;
+
+      /* Needed for 028B6C_DISTRIBUTION_MODE != 0. (implies >= GFX8) */
+      if (sscreen->info.has_distributed_tess) {
+         if (key->u.uses_gs) {
+            if (sscreen->info.chip_class == GFX8)
+               partial_es_wave = true;
+         } else {
+            partial_vs_wave = true;
+         }
+      }
+   }
+
+   /* This is a hardware requirement. */
+   if (key->u.line_stipple_enabled || (sscreen->debug_flags & DBG(SWITCH_ON_EOP))) {
+      ia_switch_on_eop = true;
+      wd_switch_on_eop = true;
+   }
+
+   if (sscreen->info.chip_class >= GFX7) {
+      /* WD_SWITCH_ON_EOP has no effect on GPUs with less than
+       * 4 shader engines. Set 1 to pass the assertion below.
+       * The other cases are hardware requirements.
+       *
+       * Polaris supports primitive restart with WD_SWITCH_ON_EOP=0
+       * for points, line strips, and tri strips.
+       */
+      if (sscreen->info.max_se <= 2 || key->u.prim == PIPE_PRIM_POLYGON ||
+          key->u.prim == PIPE_PRIM_LINE_LOOP || key->u.prim == PIPE_PRIM_TRIANGLE_FAN ||
+          key->u.prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY ||
+          (key->u.primitive_restart &&
+           (sscreen->info.family < CHIP_POLARIS10 ||
+            (key->u.prim != PIPE_PRIM_POINTS && key->u.prim != PIPE_PRIM_LINE_STRIP &&
+             key->u.prim != PIPE_PRIM_TRIANGLE_STRIP))) ||
+          key->u.count_from_stream_output)
+         wd_switch_on_eop = true;
+
+      /* Hawaii hangs if instancing is enabled and WD_SWITCH_ON_EOP is 0.
+       * We don't know that for indirect drawing, so treat it as
+       * always problematic. */
+      if (sscreen->info.family == CHIP_HAWAII && key->u.uses_instancing)
+         wd_switch_on_eop = true;
+
+      /* Performance recommendation for 4 SE Gfx7-8 parts if
+       * instances are smaller than a primgroup.
+       * Assume indirect draws always use small instances.
+       * This is needed for good VS wave utilization.
+       */
+      if (sscreen->info.chip_class <= GFX8 && sscreen->info.max_se == 4 &&
+          key->u.multi_instances_smaller_than_primgroup)
+         wd_switch_on_eop = true;
+
+      /* Required on GFX7 and later. */
+      if (sscreen->info.max_se == 4 && !wd_switch_on_eop)
+         ia_switch_on_eoi = true;
+
+      /* HW engineers suggested that PARTIAL_VS_WAVE_ON should be set
+       * to work around a GS hang.
+       */
+      if (key->u.uses_gs &&
+          (sscreen->info.family == CHIP_TONGA || sscreen->info.family == CHIP_FIJI ||
+           sscreen->info.family == CHIP_POLARIS10 || sscreen->info.family == CHIP_POLARIS11 ||
+           sscreen->info.family == CHIP_POLARIS12 || sscreen->info.family == CHIP_VEGAM))
+         partial_vs_wave = true;
+
+      /* Required by Hawaii and, for some special cases, by GFX8. */
+      if (ia_switch_on_eoi &&
+          (sscreen->info.family == CHIP_HAWAII ||
+           (sscreen->info.chip_class == GFX8 && (key->u.uses_gs || max_primgroup_in_wave != 2))))
+         partial_vs_wave = true;
+
+      /* Instancing bug on Bonaire. */
+      if (sscreen->info.family == CHIP_BONAIRE && ia_switch_on_eoi && key->u.uses_instancing)
+         partial_vs_wave = true;
+
+      /* This only applies to Polaris10 and later 4 SE chips.
+       * wd_switch_on_eop is already true on all other chips.
+       */
+      if (!wd_switch_on_eop && key->u.primitive_restart)
+         partial_vs_wave = true;
+
+      /* If the WD switch is false, the IA switch must be false too. */
+      assert(wd_switch_on_eop || !ia_switch_on_eop);
+   }
+
+   /* If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */
+   if (sscreen->info.chip_class <= GFX8 && ia_switch_on_eoi)
+      partial_es_wave = true;
+
+   return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) | S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) |
+          S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) |
+          S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) |
+          S_028AA8_WD_SWITCH_ON_EOP(sscreen->info.chip_class >= GFX7 ? wd_switch_on_eop : 0) |
+          /* The following field was moved to VGT_SHADER_STAGES_EN in GFX9. */
+          S_028AA8_MAX_PRIMGRP_IN_WAVE(sscreen->info.chip_class == GFX8 ? max_primgroup_in_wave
+                                                                        : 0) |
+          S_030960_EN_INST_OPT_BASIC(sscreen->info.chip_class >= GFX9) |
+          S_030960_EN_INST_OPT_ADV(sscreen->info.chip_class >= GFX9);
 }
 
 static void si_init_ia_multi_vgt_param_table(struct si_context *sctx)
 {
-       for (int prim = 0; prim <= SI_PRIM_RECTANGLE_LIST; prim++)
-       for (int uses_instancing = 0; uses_instancing < 2; uses_instancing++)
-       for (int multi_instances = 0; multi_instances < 2; multi_instances++)
-       for (int primitive_restart = 0; primitive_restart < 2; primitive_restart++)
-       for (int count_from_so = 0; count_from_so < 2; count_from_so++)
-       for (int line_stipple = 0; line_stipple < 2; line_stipple++)
-       for (int uses_tess = 0; uses_tess < 2; uses_tess++)
-       for (int tess_uses_primid = 0; tess_uses_primid < 2; tess_uses_primid++)
-       for (int uses_gs = 0; uses_gs < 2; uses_gs++) {
-               union si_vgt_param_key key;
-
-               key.index = 0;
-               key.u.prim = prim;
-               key.u.uses_instancing = uses_instancing;
-               key.u.multi_instances_smaller_than_primgroup = multi_instances;
-               key.u.primitive_restart = primitive_restart;
-               key.u.count_from_stream_output = count_from_so;
-               key.u.line_stipple_enabled = line_stipple;
-               key.u.uses_tess = uses_tess;
-               key.u.tess_uses_prim_id = tess_uses_primid;
-               key.u.uses_gs = uses_gs;
-
-               sctx->ia_multi_vgt_param[key.index] =
-                       si_get_init_multi_vgt_param(sctx->screen, &key);
-       }
+   for (int prim = 0; prim <= SI_PRIM_RECTANGLE_LIST; prim++)
+      for (int uses_instancing = 0; uses_instancing < 2; uses_instancing++)
+         for (int multi_instances = 0; multi_instances < 2; multi_instances++)
+            for (int primitive_restart = 0; primitive_restart < 2; primitive_restart++)
+               for (int count_from_so = 0; count_from_so < 2; count_from_so++)
+                  for (int line_stipple = 0; line_stipple < 2; line_stipple++)
+                     for (int uses_tess = 0; uses_tess < 2; uses_tess++)
+                        for (int tess_uses_primid = 0; tess_uses_primid < 2; tess_uses_primid++)
+                           for (int uses_gs = 0; uses_gs < 2; uses_gs++) {
+                              union si_vgt_param_key key;
+
+                              key.index = 0;
+                              key.u.prim = prim;
+                              key.u.uses_instancing = uses_instancing;
+                              key.u.multi_instances_smaller_than_primgroup = multi_instances;
+                              key.u.primitive_restart = primitive_restart;
+                              key.u.count_from_stream_output = count_from_so;
+                              key.u.line_stipple_enabled = line_stipple;
+                              key.u.uses_tess = uses_tess;
+                              key.u.tess_uses_prim_id = tess_uses_primid;
+                              key.u.uses_gs = uses_gs;
+
+                              sctx->ia_multi_vgt_param[key.index] =
+                                 si_get_init_multi_vgt_param(sctx->screen, &key);
+                           }
 }
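The nine nested loops above enumerate every combination of the draw-state bits and precompute IA_MULTI_VGT_PARAM once per combination, so the draw path only needs a table lookup. union si_vgt_param_key itself is defined elsewhere (si_pipe.h) and is not part of this hunk; as a rough illustration of why key.index can address the table directly, a hypothetical layout with the same fields could look like this (field order and widths are assumptions, not the driver's actual definition):

#include <stdint.h>

/* Illustrative only -- the real union si_vgt_param_key lives in si_pipe.h. */
union demo_vgt_param_key {
   struct {
      unsigned prim : 4; /* up to SI_PRIM_RECTANGLE_LIST */
      unsigned uses_instancing : 1;
      unsigned multi_instances_smaller_than_primgroup : 1;
      unsigned primitive_restart : 1;
      unsigned count_from_stream_output : 1;
      unsigned line_stipple_enabled : 1;
      unsigned uses_tess : 1;
      unsigned tess_uses_prim_id : 1;
      unsigned uses_gs : 1;
      unsigned _pad : 20;
   } u;
   uint32_t index; /* the whole key viewed as one array index */
};

The STATIC_ASSERT(sizeof(union si_vgt_param_key) == 4) in si_get_init_multi_vgt_param above guarantees that the real key packs into a single 32-bit index in the same way.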
 
 static bool si_is_line_stipple_enabled(struct si_context *sctx)
 {
-       struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 
-       return rs->line_stipple_enable &&
-              sctx->current_rast_prim != PIPE_PRIM_POINTS &&
-              (rs->polygon_mode_is_lines ||
-               util_prim_is_lines(sctx->current_rast_prim));
+   return rs->line_stipple_enable && sctx->current_rast_prim != PIPE_PRIM_POINTS &&
+          (rs->polygon_mode_is_lines || util_prim_is_lines(sctx->current_rast_prim));
 }
 
 static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
-                                         const struct pipe_draw_info *info,
-                                         enum pipe_prim_type prim,
-                                         unsigned num_patches,
-                                         unsigned instance_count,
-                                         bool primitive_restart)
+                                          const struct pipe_draw_info *info,
+                                          enum pipe_prim_type prim, unsigned num_patches,
+                                          unsigned instance_count, bool primitive_restart)
 {
-       union si_vgt_param_key key = sctx->ia_multi_vgt_param_key;
-       unsigned primgroup_size;
-       unsigned ia_multi_vgt_param;
-
-       if (sctx->tes_shader.cso) {
-               primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */
-       } else if (sctx->gs_shader.cso) {
-               primgroup_size = 64; /* recommended with a GS */
-       } else {
-               primgroup_size = 128; /* recommended without a GS and tess */
-       }
-
-       key.u.prim = prim;
-       key.u.uses_instancing = info->indirect || instance_count > 1;
-       key.u.multi_instances_smaller_than_primgroup =
-               info->indirect ||
-               (instance_count > 1 &&
-                (info->count_from_stream_output ||
-                 si_num_prims_for_vertices(info, prim) < primgroup_size));
-       key.u.primitive_restart = primitive_restart;
-       key.u.count_from_stream_output = info->count_from_stream_output != NULL;
-       key.u.line_stipple_enabled = si_is_line_stipple_enabled(sctx);
-
-       ia_multi_vgt_param = sctx->ia_multi_vgt_param[key.index] |
-                            S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1);
-
-       if (sctx->gs_shader.cso) {
-               /* GS requirement. */
-               if (sctx->chip_class <= GFX8 &&
-                   SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3)
-                       ia_multi_vgt_param |= S_028AA8_PARTIAL_ES_WAVE_ON(1);
-
-               /* GS hw bug with single-primitive instances and SWITCH_ON_EOI.
-                * The hw doc says all multi-SE chips are affected, but Vulkan
-                * only applies it to Hawaii. Do what Vulkan does.
-                */
-               if (sctx->family == CHIP_HAWAII &&
-                   G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) &&
-                   (info->indirect ||
-                    (instance_count > 1 &&
-                     (info->count_from_stream_output ||
-                      si_num_prims_for_vertices(info, prim) <= 1))))
-                       sctx->flags |= SI_CONTEXT_VGT_FLUSH;
-       }
-
-       return ia_multi_vgt_param;
+   union si_vgt_param_key key = sctx->ia_multi_vgt_param_key;
+   unsigned primgroup_size;
+   unsigned ia_multi_vgt_param;
+
+   if (sctx->tes_shader.cso) {
+      primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */
+   } else if (sctx->gs_shader.cso) {
+      primgroup_size = 64; /* recommended with a GS */
+   } else {
+      primgroup_size = 128; /* recommended without a GS and tess */
+   }
+
+   key.u.prim = prim;
+   key.u.uses_instancing = info->indirect || instance_count > 1;
+   key.u.multi_instances_smaller_than_primgroup =
+      info->indirect ||
+      (instance_count > 1 &&
+       (info->count_from_stream_output || si_num_prims_for_vertices(info, prim) < primgroup_size));
+   key.u.primitive_restart = primitive_restart;
+   key.u.count_from_stream_output = info->count_from_stream_output != NULL;
+   key.u.line_stipple_enabled = si_is_line_stipple_enabled(sctx);
+
+   ia_multi_vgt_param =
+      sctx->ia_multi_vgt_param[key.index] | S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1);
+
+   if (sctx->gs_shader.cso) {
+      /* GS requirement. */
+      if (sctx->chip_class <= GFX8 &&
+          SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3)
+         ia_multi_vgt_param |= S_028AA8_PARTIAL_ES_WAVE_ON(1);
+
+      /* GS hw bug with single-primitive instances and SWITCH_ON_EOI.
+       * The hw doc says all multi-SE chips are affected, but Vulkan
+       * only applies it to Hawaii. Do what Vulkan does.
+       */
+      if (sctx->family == CHIP_HAWAII && G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) &&
+          (info->indirect || (instance_count > 1 && (info->count_from_stream_output ||
+                                                     si_num_prims_for_vertices(info, prim) <= 1))))
+         sctx->flags |= SI_CONTEXT_VGT_FLUSH;
+   }
+
+   return ia_multi_vgt_param;
 }
 
 static unsigned si_conv_prim_to_gs_out(unsigned mode)
 {
-       static const int prim_conv[] = {
-               [PIPE_PRIM_POINTS]                      = V_028A6C_OUTPRIM_TYPE_POINTLIST,
-               [PIPE_PRIM_LINES]                       = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-               [PIPE_PRIM_LINE_LOOP]                   = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-               [PIPE_PRIM_LINE_STRIP]                  = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-               [PIPE_PRIM_TRIANGLES]                   = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-               [PIPE_PRIM_TRIANGLE_STRIP]              = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-               [PIPE_PRIM_TRIANGLE_FAN]                = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-               [PIPE_PRIM_QUADS]                       = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-               [PIPE_PRIM_QUAD_STRIP]                  = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-               [PIPE_PRIM_POLYGON]                     = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-               [PIPE_PRIM_LINES_ADJACENCY]             = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-               [PIPE_PRIM_LINE_STRIP_ADJACENCY]        = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-               [PIPE_PRIM_TRIANGLES_ADJACENCY]         = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-               [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY]    = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-               [PIPE_PRIM_PATCHES]                     = V_028A6C_OUTPRIM_TYPE_POINTLIST,
-               [SI_PRIM_RECTANGLE_LIST]                = V_028A6C_VGT_OUT_RECT_V0,
-       };
-       assert(mode < ARRAY_SIZE(prim_conv));
-
-       return prim_conv[mode];
+   static const int prim_conv[] = {
+      [PIPE_PRIM_POINTS] = V_028A6C_OUTPRIM_TYPE_POINTLIST,
+      [PIPE_PRIM_LINES] = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+      [PIPE_PRIM_LINE_LOOP] = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+      [PIPE_PRIM_LINE_STRIP] = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+      [PIPE_PRIM_TRIANGLES] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+      [PIPE_PRIM_TRIANGLE_STRIP] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+      [PIPE_PRIM_TRIANGLE_FAN] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+      [PIPE_PRIM_QUADS] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+      [PIPE_PRIM_QUAD_STRIP] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+      [PIPE_PRIM_POLYGON] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+      [PIPE_PRIM_LINES_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+      [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+      [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+      [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+      [PIPE_PRIM_PATCHES] = V_028A6C_OUTPRIM_TYPE_POINTLIST,
+      [SI_PRIM_RECTANGLE_LIST] = V_028A6C_VGT_OUT_RECT_V0,
+   };
+   assert(mode < ARRAY_SIZE(prim_conv));
+
+   return prim_conv[mode];
 }
 
 /* rast_prim is the primitive type after GS. */
 static void si_emit_rasterizer_prim_state(struct si_context *sctx)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       enum pipe_prim_type rast_prim = sctx->current_rast_prim;
-       struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
-       unsigned initial_cdw = cs->current.cdw;
-
-       if (unlikely(si_is_line_stipple_enabled(sctx))) {
-               /* For lines, reset the stipple pattern at each primitive. Otherwise,
-                * reset the stipple pattern at each packet (line strips, line loops).
-                */
-               unsigned value = rs->pa_sc_line_stipple |
-                                S_028A0C_AUTO_RESET_CNTL(rast_prim == PIPE_PRIM_LINES ? 1 : 2);
-
-               radeon_opt_set_context_reg(sctx, R_028A0C_PA_SC_LINE_STIPPLE,
-                                          SI_TRACKED_PA_SC_LINE_STIPPLE, value);
-       }
-
-       unsigned gs_out_prim = si_conv_prim_to_gs_out(rast_prim);
-       if (unlikely(gs_out_prim != sctx->last_gs_out_prim &&
-                    (sctx->ngg || sctx->gs_shader.cso))) {
-               radeon_set_context_reg(cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim);
-               sctx->last_gs_out_prim = gs_out_prim;
-       }
-
-       if (initial_cdw != cs->current.cdw)
-               sctx->context_roll = true;
-
-       if (sctx->ngg) {
-               unsigned vtx_index = rs->flatshade_first ? 0 : gs_out_prim;
-
-               sctx->current_vs_state &= C_VS_STATE_OUTPRIM &
-                                         C_VS_STATE_PROVOKING_VTX_INDEX;
-               sctx->current_vs_state |= S_VS_STATE_OUTPRIM(gs_out_prim) |
-                                         S_VS_STATE_PROVOKING_VTX_INDEX(vtx_index);
-       }
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   enum pipe_prim_type rast_prim = sctx->current_rast_prim;
+   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+   unsigned initial_cdw = cs->current.cdw;
+
+   if (unlikely(si_is_line_stipple_enabled(sctx))) {
+      /* For lines, reset the stipple pattern at each primitive. Otherwise,
+       * reset the stipple pattern at each packet (line strips, line loops).
+       */
+      unsigned value =
+         rs->pa_sc_line_stipple | S_028A0C_AUTO_RESET_CNTL(rast_prim == PIPE_PRIM_LINES ? 1 : 2);
+
+      radeon_opt_set_context_reg(sctx, R_028A0C_PA_SC_LINE_STIPPLE, SI_TRACKED_PA_SC_LINE_STIPPLE,
+                                 value);
+   }
+
+   unsigned gs_out_prim = si_conv_prim_to_gs_out(rast_prim);
+   if (unlikely(gs_out_prim != sctx->last_gs_out_prim && (sctx->ngg || sctx->gs_shader.cso))) {
+      radeon_set_context_reg(cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim);
+      sctx->last_gs_out_prim = gs_out_prim;
+   }
+
+   if (initial_cdw != cs->current.cdw)
+      sctx->context_roll = true;
+
+   if (sctx->ngg) {
+      unsigned vtx_index = rs->flatshade_first ? 0 : gs_out_prim;
+
+      sctx->current_vs_state &= C_VS_STATE_OUTPRIM & C_VS_STATE_PROVOKING_VTX_INDEX;
+      sctx->current_vs_state |=
+         S_VS_STATE_OUTPRIM(gs_out_prim) | S_VS_STATE_PROVOKING_VTX_INDEX(vtx_index);
+   }
 }
 
-static void si_emit_vs_state(struct si_context *sctx,
-                            const struct pipe_draw_info *info)
+static void si_emit_vs_state(struct si_context *sctx, const struct pipe_draw_info *info)
 {
-       sctx->current_vs_state &= C_VS_STATE_INDEXED;
-       sctx->current_vs_state |= S_VS_STATE_INDEXED(!!info->index_size);
-
-       if (sctx->num_vs_blit_sgprs) {
-               /* Re-emit the state after we leave u_blitter. */
-               sctx->last_vs_state = ~0;
-               return;
-       }
-
-       if (sctx->current_vs_state != sctx->last_vs_state) {
-               struct radeon_cmdbuf *cs = sctx->gfx_cs;
-
-               /* For the API vertex shader (VS_STATE_INDEXED, LS_OUT_*). */
-               radeon_set_sh_reg(cs,
-                       sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] +
-                       SI_SGPR_VS_STATE_BITS * 4,
-                       sctx->current_vs_state);
-
-               /* Set CLAMP_VERTEX_COLOR and OUTPRIM in the last stage
-                * before the rasterizer.
-                *
-                * For TES or the GS copy shader without NGG:
-                */
-               if (sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] !=
-                   R_00B130_SPI_SHADER_USER_DATA_VS_0) {
-                       radeon_set_sh_reg(cs,
-                               R_00B130_SPI_SHADER_USER_DATA_VS_0 +
-                               SI_SGPR_VS_STATE_BITS * 4,
-                               sctx->current_vs_state);
-               }
-
-               /* For NGG: */
-               if (sctx->screen->use_ngg &&
-                   sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] !=
-                   R_00B230_SPI_SHADER_USER_DATA_GS_0) {
-                       radeon_set_sh_reg(cs,
-                                         R_00B230_SPI_SHADER_USER_DATA_GS_0 +
-                                         SI_SGPR_VS_STATE_BITS * 4,
-                                         sctx->current_vs_state);
-               }
-
-               sctx->last_vs_state = sctx->current_vs_state;
-       }
+   sctx->current_vs_state &= C_VS_STATE_INDEXED;
+   sctx->current_vs_state |= S_VS_STATE_INDEXED(!!info->index_size);
+
+   if (sctx->num_vs_blit_sgprs) {
+      /* Re-emit the state after we leave u_blitter. */
+      sctx->last_vs_state = ~0;
+      return;
+   }
+
+   if (sctx->current_vs_state != sctx->last_vs_state) {
+      struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+      /* For the API vertex shader (VS_STATE_INDEXED, LS_OUT_*). */
+      radeon_set_sh_reg(
+         cs, sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] + SI_SGPR_VS_STATE_BITS * 4,
+         sctx->current_vs_state);
+
+      /* Set CLAMP_VERTEX_COLOR and OUTPRIM in the last stage
+       * before the rasterizer.
+       *
+       * For TES or the GS copy shader without NGG:
+       */
+      if (sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] != R_00B130_SPI_SHADER_USER_DATA_VS_0) {
+         radeon_set_sh_reg(cs, R_00B130_SPI_SHADER_USER_DATA_VS_0 + SI_SGPR_VS_STATE_BITS * 4,
+                           sctx->current_vs_state);
+      }
+
+      /* For NGG: */
+      if (sctx->screen->use_ngg &&
+          sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] != R_00B230_SPI_SHADER_USER_DATA_GS_0) {
+         radeon_set_sh_reg(cs, R_00B230_SPI_SHADER_USER_DATA_GS_0 + SI_SGPR_VS_STATE_BITS * 4,
+                           sctx->current_vs_state);
+      }
+
+      sctx->last_vs_state = sctx->current_vs_state;
+   }
 }
 
-static inline bool si_prim_restart_index_changed(struct si_context *sctx,
-                                                bool primitive_restart,
-                                                unsigned restart_index)
+static inline bool si_prim_restart_index_changed(struct si_context *sctx, bool primitive_restart,
+                                                 unsigned restart_index)
 {
-       return primitive_restart &&
-              (restart_index != sctx->last_restart_index ||
-               sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN);
+   return primitive_restart && (restart_index != sctx->last_restart_index ||
+                                sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN);
 }
 
-static void si_emit_ia_multi_vgt_param(struct si_context *sctx,
-                                      const struct pipe_draw_info *info,
-                                      enum pipe_prim_type prim,
-                                      unsigned num_patches,
-                                      unsigned instance_count,
-                                      bool primitive_restart)
+static void si_emit_ia_multi_vgt_param(struct si_context *sctx, const struct pipe_draw_info *info,
+                                       enum pipe_prim_type prim, unsigned num_patches,
+                                       unsigned instance_count, bool primitive_restart)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       unsigned ia_multi_vgt_param;
-
-       ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info, prim, num_patches,
-                                                      instance_count, primitive_restart);
-
-       /* Draw state. */
-       if (ia_multi_vgt_param != sctx->last_multi_vgt_param) {
-               if (sctx->chip_class == GFX9)
-                       radeon_set_uconfig_reg_idx(cs, sctx->screen,
-                                                  R_030960_IA_MULTI_VGT_PARAM, 4,
-                                                  ia_multi_vgt_param);
-               else if (sctx->chip_class >= GFX7)
-                       radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
-               else
-                       radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
-
-               sctx->last_multi_vgt_param = ia_multi_vgt_param;
-       }
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   unsigned ia_multi_vgt_param;
+
+   ia_multi_vgt_param =
+      si_get_ia_multi_vgt_param(sctx, info, prim, num_patches, instance_count, primitive_restart);
+
+   /* Draw state. */
+   if (ia_multi_vgt_param != sctx->last_multi_vgt_param) {
+      if (sctx->chip_class == GFX9)
+         radeon_set_uconfig_reg_idx(cs, sctx->screen, R_030960_IA_MULTI_VGT_PARAM, 4,
+                                    ia_multi_vgt_param);
+      else if (sctx->chip_class >= GFX7)
+         radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
+      else
+         radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
+
+      sctx->last_multi_vgt_param = ia_multi_vgt_param;
+   }
 }
 
 /* GFX10 removed IA_MULTI_VGT_PARAM in exchange for GE_CNTL.
@@ -720,1601 +662,1460 @@ static void si_emit_ia_multi_vgt_param(struct si_context *sctx,
  */
 static void gfx10_emit_ge_cntl(struct si_context *sctx, unsigned num_patches)
 {
-       union si_vgt_param_key key = sctx->ia_multi_vgt_param_key;
-       unsigned ge_cntl;
-
-       if (sctx->ngg) {
-               if (sctx->tes_shader.cso) {
-                       ge_cntl = S_03096C_PRIM_GRP_SIZE(num_patches) |
-                                 S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */
-                                 S_03096C_BREAK_WAVE_AT_EOI(key.u.tess_uses_prim_id);
-               } else {
-                       ge_cntl = si_get_vs_state(sctx)->ge_cntl;
-               }
-       } else {
-               unsigned primgroup_size;
-               unsigned vertgroup_size = 256; /* 256 = disable vertex grouping */;
-
-               if (sctx->tes_shader.cso) {
-                       primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */
-               } else if (sctx->gs_shader.cso) {
-                       unsigned vgt_gs_onchip_cntl = sctx->gs_shader.current->ctx_reg.gs.vgt_gs_onchip_cntl;
-                       primgroup_size = G_028A44_GS_PRIMS_PER_SUBGRP(vgt_gs_onchip_cntl);
-               } else {
-                       primgroup_size = 128; /* recommended without a GS and tess */
-               }
-
-               ge_cntl = S_03096C_PRIM_GRP_SIZE(primgroup_size) |
-                         S_03096C_VERT_GRP_SIZE(vertgroup_size) |
-                         S_03096C_BREAK_WAVE_AT_EOI(key.u.uses_tess && key.u.tess_uses_prim_id);
-       }
-
-       ge_cntl |= S_03096C_PACKET_TO_ONE_PA(si_is_line_stipple_enabled(sctx));
-
-       if (ge_cntl != sctx->last_multi_vgt_param) {
-               radeon_set_uconfig_reg(sctx->gfx_cs, R_03096C_GE_CNTL, ge_cntl);
-               sctx->last_multi_vgt_param = ge_cntl;
-       }
+   union si_vgt_param_key key = sctx->ia_multi_vgt_param_key;
+   unsigned ge_cntl;
+
+   if (sctx->ngg) {
+      if (sctx->tes_shader.cso) {
+         ge_cntl = S_03096C_PRIM_GRP_SIZE(num_patches) |
+                   S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */
+                   S_03096C_BREAK_WAVE_AT_EOI(key.u.tess_uses_prim_id);
+      } else {
+         ge_cntl = si_get_vs_state(sctx)->ge_cntl;
+      }
+   } else {
+      unsigned primgroup_size;
+      unsigned vertgroup_size = 256; /* 256 = disable vertex grouping */
+
+      if (sctx->tes_shader.cso) {
+         primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */
+      } else if (sctx->gs_shader.cso) {
+         unsigned vgt_gs_onchip_cntl = sctx->gs_shader.current->ctx_reg.gs.vgt_gs_onchip_cntl;
+         primgroup_size = G_028A44_GS_PRIMS_PER_SUBGRP(vgt_gs_onchip_cntl);
+      } else {
+         primgroup_size = 128; /* recommended without a GS and tess */
+      }
+
+      ge_cntl = S_03096C_PRIM_GRP_SIZE(primgroup_size) | S_03096C_VERT_GRP_SIZE(vertgroup_size) |
+                S_03096C_BREAK_WAVE_AT_EOI(key.u.uses_tess && key.u.tess_uses_prim_id);
+   }
+
+   ge_cntl |= S_03096C_PACKET_TO_ONE_PA(si_is_line_stipple_enabled(sctx));
+
+   if (ge_cntl != sctx->last_multi_vgt_param) {
+      radeon_set_uconfig_reg(sctx->gfx_cs, R_03096C_GE_CNTL, ge_cntl);
+      sctx->last_multi_vgt_param = ge_cntl;
+   }
 }
 
-static void si_emit_draw_registers(struct si_context *sctx,
-                                  const struct pipe_draw_info *info,
-                                  enum pipe_prim_type prim,
-                                  unsigned num_patches,
-                                  unsigned instance_count,
-                                  bool primitive_restart)
+static void si_emit_draw_registers(struct si_context *sctx, const struct pipe_draw_info *info,
+                                   enum pipe_prim_type prim, unsigned num_patches,
+                                   unsigned instance_count, bool primitive_restart)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       unsigned vgt_prim = si_conv_pipe_prim(prim);
-
-       if (sctx->chip_class >= GFX10)
-               gfx10_emit_ge_cntl(sctx, num_patches);
-       else
-               si_emit_ia_multi_vgt_param(sctx, info, prim, num_patches,
-                                          instance_count, primitive_restart);
-
-       if (vgt_prim != sctx->last_prim) {
-               if (sctx->chip_class >= GFX10)
-                       radeon_set_uconfig_reg(cs, R_030908_VGT_PRIMITIVE_TYPE, vgt_prim);
-               else if (sctx->chip_class >= GFX7)
-                       radeon_set_uconfig_reg_idx(cs, sctx->screen,
-                                                  R_030908_VGT_PRIMITIVE_TYPE, 1, vgt_prim);
-               else
-                       radeon_set_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, vgt_prim);
-
-               sctx->last_prim = vgt_prim;
-       }
-
-       /* Primitive restart. */
-       if (primitive_restart != sctx->last_primitive_restart_en) {
-               if (sctx->chip_class >= GFX9)
-                       radeon_set_uconfig_reg(cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN,
-                                              primitive_restart);
-               else
-                       radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
-                                              primitive_restart);
-
-               sctx->last_primitive_restart_en = primitive_restart;
-
-       }
-       if (si_prim_restart_index_changed(sctx, primitive_restart, info->restart_index)) {
-               radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
-                                      info->restart_index);
-               sctx->last_restart_index = info->restart_index;
-               sctx->context_roll = true;
-       }
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   unsigned vgt_prim = si_conv_pipe_prim(prim);
+
+   if (sctx->chip_class >= GFX10)
+      gfx10_emit_ge_cntl(sctx, num_patches);
+   else
+      si_emit_ia_multi_vgt_param(sctx, info, prim, num_patches, instance_count, primitive_restart);
+
+   if (vgt_prim != sctx->last_prim) {
+      if (sctx->chip_class >= GFX10)
+         radeon_set_uconfig_reg(cs, R_030908_VGT_PRIMITIVE_TYPE, vgt_prim);
+      else if (sctx->chip_class >= GFX7)
+         radeon_set_uconfig_reg_idx(cs, sctx->screen, R_030908_VGT_PRIMITIVE_TYPE, 1, vgt_prim);
+      else
+         radeon_set_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, vgt_prim);
+
+      sctx->last_prim = vgt_prim;
+   }
+
+   /* Primitive restart. */
+   if (primitive_restart != sctx->last_primitive_restart_en) {
+      if (sctx->chip_class >= GFX9)
+         radeon_set_uconfig_reg(cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN, primitive_restart);
+      else
+         radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, primitive_restart);
+
+      sctx->last_primitive_restart_en = primitive_restart;
+   }
+   if (si_prim_restart_index_changed(sctx, primitive_restart, info->restart_index)) {
+      radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, info->restart_index);
+      sctx->last_restart_index = info->restart_index;
+      sctx->context_roll = true;
+   }
 }
 
-static void si_emit_draw_packets(struct si_context *sctx,
-                                const struct pipe_draw_info *info,
-                                struct pipe_resource *indexbuf,
-                                unsigned index_size,
-                                unsigned index_offset,
-                                unsigned instance_count,
-                                bool dispatch_prim_discard_cs,
-                                unsigned original_index_size)
+static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info,
+                                 struct pipe_resource *indexbuf, unsigned index_size,
+                                 unsigned index_offset, unsigned instance_count,
+                                 bool dispatch_prim_discard_cs, unsigned original_index_size)
 {
-       struct pipe_draw_indirect_info *indirect = info->indirect;
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       unsigned sh_base_reg = sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX];
-       bool render_cond_bit = sctx->render_cond && !sctx->render_cond_force_off;
-       uint32_t index_max_size = 0;
-       uint64_t index_va = 0;
-
-       if (info->count_from_stream_output) {
-               struct si_streamout_target *t =
-                       (struct si_streamout_target*)info->count_from_stream_output;
-
-               radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE,
-                                      t->stride_in_dw);
-               si_cp_copy_data(sctx, sctx->gfx_cs,
-                               COPY_DATA_REG, NULL,
-                               R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2,
-                               COPY_DATA_SRC_MEM, t->buf_filled_size,
-                               t->buf_filled_size_offset);
-       }
-
-       /* draw packet */
-       if (index_size) {
-               if (index_size != sctx->last_index_size) {
-                       unsigned index_type;
-
-                       /* index type */
-                       switch (index_size) {
-                       case 1:
-                               index_type = V_028A7C_VGT_INDEX_8;
-                               break;
-                       case 2:
-                               index_type = V_028A7C_VGT_INDEX_16 |
-                                            (SI_BIG_ENDIAN && sctx->chip_class <= GFX7 ?
-                                                     V_028A7C_VGT_DMA_SWAP_16_BIT : 0);
-                               break;
-                       case 4:
-                               index_type = V_028A7C_VGT_INDEX_32 |
-                                            (SI_BIG_ENDIAN && sctx->chip_class <= GFX7 ?
-                                                     V_028A7C_VGT_DMA_SWAP_32_BIT : 0);
-                               break;
-                       default:
-                               assert(!"unreachable");
-                               return;
-                       }
-
-                       if (sctx->chip_class >= GFX9) {
-                               radeon_set_uconfig_reg_idx(cs, sctx->screen,
-                                                          R_03090C_VGT_INDEX_TYPE, 2,
-                                                          index_type);
-                       } else {
-                               radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
-                               radeon_emit(cs, index_type);
-                       }
-
-                       sctx->last_index_size = index_size;
-               }
-
-               if (original_index_size) {
-                       index_max_size = (indexbuf->width0 - index_offset) /
-                                         original_index_size;
-                       /* Skip draw calls with 0-sized index buffers.
-                        * They cause a hang on some chips, like Navi10-14.
-                        */
-                       if (!index_max_size)
-                               return;
-
-                       index_va = si_resource(indexbuf)->gpu_address + index_offset;
-
-                       radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-                                             si_resource(indexbuf),
-                                             RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER);
-               }
-       } else {
-               /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
-                * so the state must be re-emitted before the next indexed draw.
-                */
-               if (sctx->chip_class >= GFX7)
-                       sctx->last_index_size = -1;
-       }
-
-       if (indirect) {
-               uint64_t indirect_va = si_resource(indirect->buffer)->gpu_address;
-
-               assert(indirect_va % 8 == 0);
-
-               si_invalidate_draw_sh_constants(sctx);
-
-               radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
-               radeon_emit(cs, 1);
-               radeon_emit(cs, indirect_va);
-               radeon_emit(cs, indirect_va >> 32);
-
-               radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-                                     si_resource(indirect->buffer),
-                                     RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
-
-               unsigned di_src_sel = index_size ? V_0287F0_DI_SRC_SEL_DMA
-                                                   : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
-
-               assert(indirect->offset % 4 == 0);
-
-               if (index_size) {
-                       radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
-                       radeon_emit(cs, index_va);
-                       radeon_emit(cs, index_va >> 32);
-
-                       radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
-                       radeon_emit(cs, index_max_size);
-               }
-
-               if (!sctx->screen->has_draw_indirect_multi) {
-                       radeon_emit(cs, PKT3(index_size ? PKT3_DRAW_INDEX_INDIRECT
-                                                          : PKT3_DRAW_INDIRECT,
-                                            3, render_cond_bit));
-                       radeon_emit(cs, indirect->offset);
-                       radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
-                       radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
-                       radeon_emit(cs, di_src_sel);
-               } else {
-                       uint64_t count_va = 0;
-
-                       if (indirect->indirect_draw_count) {
-                               struct si_resource *params_buf =
-                                       si_resource(indirect->indirect_draw_count);
-
-                               radeon_add_to_buffer_list(
-                                       sctx, sctx->gfx_cs, params_buf,
-                                       RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
-
-                               count_va = params_buf->gpu_address + indirect->indirect_draw_count_offset;
-                       }
-
-                       radeon_emit(cs, PKT3(index_size ? PKT3_DRAW_INDEX_INDIRECT_MULTI :
-                                                            PKT3_DRAW_INDIRECT_MULTI,
-                                            8, render_cond_bit));
-                       radeon_emit(cs, indirect->offset);
-                       radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
-                       radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
-                       radeon_emit(cs, ((sh_base_reg + SI_SGPR_DRAWID * 4 - SI_SH_REG_OFFSET) >> 2) |
-                                       S_2C3_DRAW_INDEX_ENABLE(1) |
-                                       S_2C3_COUNT_INDIRECT_ENABLE(!!indirect->indirect_draw_count));
-                       radeon_emit(cs, indirect->draw_count);
-                       radeon_emit(cs, count_va);
-                       radeon_emit(cs, count_va >> 32);
-                       radeon_emit(cs, indirect->stride);
-                       radeon_emit(cs, di_src_sel);
-               }
-       } else {
-               int base_vertex;
-
-               if (sctx->last_instance_count == SI_INSTANCE_COUNT_UNKNOWN ||
-                   sctx->last_instance_count != instance_count) {
-                       radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, 0));
-                       radeon_emit(cs, instance_count);
-                       sctx->last_instance_count = instance_count;
-               }
-
-               /* Base vertex and start instance. */
-               base_vertex = original_index_size ? info->index_bias : info->start;
-
-               if (sctx->num_vs_blit_sgprs) {
-                       /* Re-emit draw constants after we leave u_blitter. */
-                       si_invalidate_draw_sh_constants(sctx);
-
-                       /* Blit VS doesn't use BASE_VERTEX, START_INSTANCE, and DRAWID. */
-                       radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_VS_BLIT_DATA * 4,
-                                             sctx->num_vs_blit_sgprs);
-                       radeon_emit_array(cs, sctx->vs_blit_sh_data,
-                                         sctx->num_vs_blit_sgprs);
-               } else if (base_vertex != sctx->last_base_vertex ||
-                          sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN ||
-                          info->start_instance != sctx->last_start_instance ||
-                          info->drawid != sctx->last_drawid ||
-                          sh_base_reg != sctx->last_sh_base_reg) {
-                       radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 3);
-                       radeon_emit(cs, base_vertex);
-                       radeon_emit(cs, info->start_instance);
-                       radeon_emit(cs, info->drawid);
-
-                       sctx->last_base_vertex = base_vertex;
-                       sctx->last_start_instance = info->start_instance;
-                       sctx->last_drawid = info->drawid;
-                       sctx->last_sh_base_reg = sh_base_reg;
-               }
-
-               if (index_size) {
-                       if (dispatch_prim_discard_cs) {
-                               index_va += info->start * original_index_size;
-                               index_max_size = MIN2(index_max_size, info->count);
-
-                               si_dispatch_prim_discard_cs_and_draw(sctx, info,
-                                                                    original_index_size,
-                                                                    base_vertex,
-                                                                    index_va, index_max_size);
-                               return;
-                       }
-
-                       index_va += info->start * index_size;
-
-                       radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));
-                       radeon_emit(cs, index_max_size);
-                       radeon_emit(cs, index_va);
-                       radeon_emit(cs, index_va >> 32);
-                       radeon_emit(cs, info->count);
-                       radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA);
-               } else {
-                       radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit));
-                       radeon_emit(cs, info->count);
-                       radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
-                                       S_0287F0_USE_OPAQUE(!!info->count_from_stream_output));
-               }
-       }
+   struct pipe_draw_indirect_info *indirect = info->indirect;
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   unsigned sh_base_reg = sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX];
+   bool render_cond_bit = sctx->render_cond && !sctx->render_cond_force_off;
+   uint32_t index_max_size = 0;
+   uint64_t index_va = 0;
+
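+   /* DrawTransformFeedback: the vertex count isn't known on the CPU, so the CP
+    * copies the streamout buffer-filled size into the VGT register below and
+    * the draw packet sources its count from it via USE_OPAQUE.
+    */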
+   if (info->count_from_stream_output) {
+      struct si_streamout_target *t = (struct si_streamout_target *)info->count_from_stream_output;
+
+      radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, t->stride_in_dw);
+      si_cp_copy_data(sctx, sctx->gfx_cs, COPY_DATA_REG, NULL,
+                      R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2, COPY_DATA_SRC_MEM,
+                      t->buf_filled_size, t->buf_filled_size_offset);
+   }
+
+   /* draw packet */
+   if (index_size) {
+      if (index_size != sctx->last_index_size) {
+         unsigned index_type;
+
+         /* index type */
+         switch (index_size) {
+         case 1:
+            index_type = V_028A7C_VGT_INDEX_8;
+            break;
+         case 2:
+            index_type =
+               V_028A7C_VGT_INDEX_16 |
+               (SI_BIG_ENDIAN && sctx->chip_class <= GFX7 ? V_028A7C_VGT_DMA_SWAP_16_BIT : 0);
+            break;
+         case 4:
+            index_type =
+               V_028A7C_VGT_INDEX_32 |
+               (SI_BIG_ENDIAN && sctx->chip_class <= GFX7 ? V_028A7C_VGT_DMA_SWAP_32_BIT : 0);
+            break;
+         default:
+            assert(!"unreachable");
+            return;
+         }
+
+         if (sctx->chip_class >= GFX9) {
+            radeon_set_uconfig_reg_idx(cs, sctx->screen, R_03090C_VGT_INDEX_TYPE, 2, index_type);
+         } else {
+            radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
+            radeon_emit(cs, index_type);
+         }
+
+         sctx->last_index_size = index_size;
+      }
+
+      if (original_index_size) {
+         index_max_size = (indexbuf->width0 - index_offset) / original_index_size;
+         /* Skip draw calls with 0-sized index buffers.
+          * They cause a hang on some chips, like Navi10-14.
+          */
+         if (!index_max_size)
+            return;
+
+         index_va = si_resource(indexbuf)->gpu_address + index_offset;
+
+         radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(indexbuf), RADEON_USAGE_READ,
+                                   RADEON_PRIO_INDEX_BUFFER);
+      }
+   } else {
+      /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
+       * so the state must be re-emitted before the next indexed draw.
+       */
+      if (sctx->chip_class >= GFX7)
+         sctx->last_index_size = -1;
+   }
+
+   if (indirect) {
+      uint64_t indirect_va = si_resource(indirect->buffer)->gpu_address;
+
+      assert(indirect_va % 8 == 0);
+
+      si_invalidate_draw_sh_constants(sctx);
+
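+      /* SET_BASE (base index 1) sets the base address that the following
+       * DRAW_(INDEX_)INDIRECT packets add their byte offsets to.
+       */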
+      radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
+      radeon_emit(cs, 1);
+      radeon_emit(cs, indirect_va);
+      radeon_emit(cs, indirect_va >> 32);
+
+      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(indirect->buffer),
+                                RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
+
+      unsigned di_src_sel = index_size ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
+
+      assert(indirect->offset % 4 == 0);
+
+      if (index_size) {
+         radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
+         radeon_emit(cs, index_va);
+         radeon_emit(cs, index_va >> 32);
+
+         radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
+         radeon_emit(cs, index_max_size);
+      }
+
+      if (!sctx->screen->has_draw_indirect_multi) {
+         radeon_emit(cs, PKT3(index_size ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT, 3,
+                              render_cond_bit));
+         radeon_emit(cs, indirect->offset);
+         radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
+         radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
+         radeon_emit(cs, di_src_sel);
+      } else {
+         uint64_t count_va = 0;
+
+         if (indirect->indirect_draw_count) {
+            struct si_resource *params_buf = si_resource(indirect->indirect_draw_count);
+
+            radeon_add_to_buffer_list(sctx, sctx->gfx_cs, params_buf, RADEON_USAGE_READ,
+                                      RADEON_PRIO_DRAW_INDIRECT);
+
+            count_va = params_buf->gpu_address + indirect->indirect_draw_count_offset;
+         }
+
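+         /* DRAW_(INDEX_)INDIRECT_MULTI executes up to indirect->draw_count
+          * draws, stepping through the argument buffer by indirect->stride
+          * bytes per draw; with COUNT_INDIRECT_ENABLE the actual draw count
+          * is read from count_va instead.
+          */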
+         radeon_emit(cs,
+                     PKT3(index_size ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8,
+                          render_cond_bit));
+         radeon_emit(cs, indirect->offset);
+         radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
+         radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
+         radeon_emit(cs, ((sh_base_reg + SI_SGPR_DRAWID * 4 - SI_SH_REG_OFFSET) >> 2) |
+                            S_2C3_DRAW_INDEX_ENABLE(1) |
+                            S_2C3_COUNT_INDIRECT_ENABLE(!!indirect->indirect_draw_count));
+         radeon_emit(cs, indirect->draw_count);
+         radeon_emit(cs, count_va);
+         radeon_emit(cs, count_va >> 32);
+         radeon_emit(cs, indirect->stride);
+         radeon_emit(cs, di_src_sel);
+      }
+   } else {
+      int base_vertex;
+
+      if (sctx->last_instance_count == SI_INSTANCE_COUNT_UNKNOWN ||
+          sctx->last_instance_count != instance_count) {
+         radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, 0));
+         radeon_emit(cs, instance_count);
+         sctx->last_instance_count = instance_count;
+      }
+
+      /* Base vertex and start instance. */
+      base_vertex = original_index_size ? info->index_bias : info->start;
+
+      if (sctx->num_vs_blit_sgprs) {
+         /* Re-emit draw constants after we leave u_blitter. */
+         si_invalidate_draw_sh_constants(sctx);
+
+         /* Blit VS doesn't use BASE_VERTEX, START_INSTANCE, and DRAWID. */
+         radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_VS_BLIT_DATA * 4, sctx->num_vs_blit_sgprs);
+         radeon_emit_array(cs, sctx->vs_blit_sh_data, sctx->num_vs_blit_sgprs);
+      } else if (base_vertex != sctx->last_base_vertex ||
+                 sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN ||
+                 info->start_instance != sctx->last_start_instance ||
+                 info->drawid != sctx->last_drawid || sh_base_reg != sctx->last_sh_base_reg) {
+         radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 3);
+         radeon_emit(cs, base_vertex);
+         radeon_emit(cs, info->start_instance);
+         radeon_emit(cs, info->drawid);
+
+         sctx->last_base_vertex = base_vertex;
+         sctx->last_start_instance = info->start_instance;
+         sctx->last_drawid = info->drawid;
+         sctx->last_sh_base_reg = sh_base_reg;
+      }
+
+      if (index_size) {
+         if (dispatch_prim_discard_cs) {
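+            /* Compute-based primitive culling: hand the draw off to the prim
+             * discard path (see si_compute_prim_discard.c), which culls the
+             * index buffer on the parallel compute IB and emits the draw
+             * packet itself.
+             */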
+            index_va += info->start * original_index_size;
+            index_max_size = MIN2(index_max_size, info->count);
+
+            si_dispatch_prim_discard_cs_and_draw(sctx, info, original_index_size, base_vertex,
+                                                 index_va, index_max_size);
+            return;
+         }
+
+         index_va += info->start * index_size;
+
+         radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));
+         radeon_emit(cs, index_max_size);
+         radeon_emit(cs, index_va);
+         radeon_emit(cs, index_va >> 32);
+         radeon_emit(cs, info->count);
+         radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA);
+      } else {
+         radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit));
+         radeon_emit(cs, info->count);
+         radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
+                            S_0287F0_USE_OPAQUE(!!info->count_from_stream_output));
+      }
+   }
 }
 
-void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs,
-                         unsigned cp_coher_cntl)
+void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl)
 {
-       bool compute_ib = !sctx->has_graphics ||
-                         cs == sctx->prim_discard_compute_cs;
-
-       assert(sctx->chip_class <= GFX9);
-
-       if (sctx->chip_class == GFX9 || compute_ib) {
-               /* Flush caches and wait for the caches to assert idle. */
-               radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0));
-               radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
-               radeon_emit(cs, 0xffffffff);    /* CP_COHER_SIZE */
-               radeon_emit(cs, 0xffffff);      /* CP_COHER_SIZE_HI */
-               radeon_emit(cs, 0);             /* CP_COHER_BASE */
-               radeon_emit(cs, 0);             /* CP_COHER_BASE_HI */
-               radeon_emit(cs, 0x0000000A);    /* POLL_INTERVAL */
-       } else {
-               /* ACQUIRE_MEM is only required on a compute ring. */
-               radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
-               radeon_emit(cs, cp_coher_cntl);   /* CP_COHER_CNTL */
-               radeon_emit(cs, 0xffffffff);      /* CP_COHER_SIZE */
-               radeon_emit(cs, 0);               /* CP_COHER_BASE */
-               radeon_emit(cs, 0x0000000A);      /* POLL_INTERVAL */
-       }
-
-       /* ACQUIRE_MEM has an implicit context roll if the current context
-        * is busy. */
-       if (!compute_ib)
-               sctx->context_roll = true;
+   bool compute_ib = !sctx->has_graphics || cs == sctx->prim_discard_compute_cs;
+
+   assert(sctx->chip_class <= GFX9);
+
+   if (sctx->chip_class == GFX9 || compute_ib) {
+      /* Flush caches and wait for the caches to assert idle. */
+      radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0));
+      radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
+      radeon_emit(cs, 0xffffffff);    /* CP_COHER_SIZE */
+      radeon_emit(cs, 0xffffff);      /* CP_COHER_SIZE_HI */
+      radeon_emit(cs, 0);             /* CP_COHER_BASE */
+      radeon_emit(cs, 0);             /* CP_COHER_BASE_HI */
+      radeon_emit(cs, 0x0000000A);    /* POLL_INTERVAL */
+   } else {
+      /* ACQUIRE_MEM is only required on a compute ring. */
+      radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
+      radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
+      radeon_emit(cs, 0xffffffff);    /* CP_COHER_SIZE */
+      radeon_emit(cs, 0);             /* CP_COHER_BASE */
+      radeon_emit(cs, 0x0000000A);    /* POLL_INTERVAL */
+   }
+
+   /* ACQUIRE_MEM has an implicit context roll if the current context
+    * is busy. */
+   if (!compute_ib)
+      sctx->context_roll = true;
 }
 
 void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx)
 {
-       if (!si_compute_prim_discard_enabled(sctx))
-               return;
-
-       if (!sctx->barrier_buf) {
-               u_suballocator_alloc(sctx->allocator_zeroed_memory, 4, 4,
-                                    &sctx->barrier_buf_offset,
-                                    (struct pipe_resource**)&sctx->barrier_buf);
-       }
-
-       /* Emit a placeholder to signal the next compute IB to start.
-        * See si_compute_prim_discard.c for explanation.
-        */
-       uint32_t signal = 1;
-       si_cp_write_data(sctx, sctx->barrier_buf, sctx->barrier_buf_offset,
-                        4, V_370_MEM, V_370_ME, &signal);
-
-       sctx->last_pkt3_write_data =
-                       &sctx->gfx_cs->current.buf[sctx->gfx_cs->current.cdw - 5];
-
-       /* Only the last occurence of WRITE_DATA will be executed.
-        * The packet will be enabled in si_flush_gfx_cs.
-        */
-       *sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0);
+   if (!si_compute_prim_discard_enabled(sctx))
+      return;
+
+   if (!sctx->barrier_buf) {
+      u_suballocator_alloc(sctx->allocator_zeroed_memory, 4, 4, &sctx->barrier_buf_offset,
+                           (struct pipe_resource **)&sctx->barrier_buf);
+   }
+
+   /* Emit a placeholder to signal the next compute IB to start.
+    * See si_compute_prim_discard.c for explanation.
+    */
+   uint32_t signal = 1;
+   si_cp_write_data(sctx, sctx->barrier_buf, sctx->barrier_buf_offset, 4, V_370_MEM, V_370_ME,
+                    &signal);
+
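+   /* The WRITE_DATA packet emitted above is 5 dwords (header, control, 2
+    * address dwords, 1 data dword), so cdw - 5 points at its PKT3 header.
+    */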
+   sctx->last_pkt3_write_data = &sctx->gfx_cs->current.buf[sctx->gfx_cs->current.cdw - 5];
+
+   /* Only the last occurrence of WRITE_DATA will be executed.
+    * The packet will be enabled in si_flush_gfx_cs.
+    */
+   *sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0);
 }
 
 void gfx10_emit_cache_flush(struct si_context *ctx)
 {
-       struct radeon_cmdbuf *cs = ctx->gfx_cs;
-       uint32_t gcr_cntl = 0;
-       unsigned cb_db_event = 0;
-       unsigned flags = ctx->flags;
-
-       if (!ctx->has_graphics) {
-               /* Only process compute flags. */
-               flags &= SI_CONTEXT_INV_ICACHE |
-                        SI_CONTEXT_INV_SCACHE |
-                        SI_CONTEXT_INV_VCACHE |
-                        SI_CONTEXT_INV_L2 |
-                        SI_CONTEXT_WB_L2 |
-                        SI_CONTEXT_INV_L2_METADATA |
-                        SI_CONTEXT_CS_PARTIAL_FLUSH;
-       }
-
-       /* We don't need these. */
-       assert(!(flags & (SI_CONTEXT_VGT_STREAMOUT_SYNC |
-                         SI_CONTEXT_FLUSH_AND_INV_DB_META)));
-
-       if (flags & SI_CONTEXT_VGT_FLUSH) {
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-               radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
-       }
-
-       if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
-               ctx->num_cb_cache_flushes++;
-       if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
-               ctx->num_db_cache_flushes++;
-
-       if (flags & SI_CONTEXT_INV_ICACHE)
-               gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL);
-       if (flags & SI_CONTEXT_INV_SCACHE) {
-               /* TODO: When writing to the SMEM L1 cache, we need to set SEQ
-                * to FORWARD when both L1 and L2 are written out (WB or INV).
-                */
-               gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1);
-       }
-       if (flags & SI_CONTEXT_INV_VCACHE)
-               gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1);
-
-       /* The L2 cache ops are:
-        * - INV: - invalidate lines that reflect memory (were loaded from memory)
-        *        - don't touch lines that were overwritten (were stored by gfx clients)
-        * - WB: - don't touch lines that reflect memory
-        *       - write back lines that were overwritten
-        * - WB | INV: - invalidate lines that reflect memory
-        *             - write back lines that were overwritten
-        *
-        * GLM doesn't support WB alone. If WB is set, INV must be set too.
-        */
-       if (flags & SI_CONTEXT_INV_L2) {
-               /* Writeback and invalidate everything in L2. */
-               gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) |
-                           S_586_GLM_INV(1) | S_586_GLM_WB(1);
-               ctx->num_L2_invalidates++;
-       } else if (flags & SI_CONTEXT_WB_L2) {
-               gcr_cntl |= S_586_GL2_WB(1) |
-                           S_586_GLM_WB(1) | S_586_GLM_INV(1);
-       } else if (flags & SI_CONTEXT_INV_L2_METADATA) {
-               gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1);
-       }
-
-       if (flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
-               if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
-                       /* Flush CMASK/FMASK/DCC. Will wait for idle later. */
-                       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-                       radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) |
-                                       EVENT_INDEX(0));
-               }
-               if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
-                       /* Flush HTILE. Will wait for idle later. */
-                       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-                       radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) |
-                                       EVENT_INDEX(0));
-               }
-
-               /* First flush CB/DB, then L1/L2. */
-               gcr_cntl |= S_586_SEQ(V_586_SEQ_FORWARD);
-
-               if ((flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) ==
-                   (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
-                       cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
-               } else if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
-                       cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
-               } else if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
-                       cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
-               } else {
-                       assert(0);
-               }
-       } else {
-               /* Wait for graphics shaders to go idle if requested. */
-               if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
-                       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-                       radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
-                       /* Only count explicit shader flushes, not implicit ones. */
-                       ctx->num_vs_flushes++;
-                       ctx->num_ps_flushes++;
-               } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
-                       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-                       radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
-                       ctx->num_vs_flushes++;
-               }
-       }
-
-       if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && ctx->compute_is_busy) {
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-               radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4)));
-               ctx->num_cs_flushes++;
-               ctx->compute_is_busy = false;
-       }
-
-       if (cb_db_event) {
-               /* CB/DB flush and invalidate (or possibly just a wait for a
-                * meta flush) via RELEASE_MEM.
-                *
-                * Combine this with other cache flushes when possible; this
-                * requires affected shaders to be idle, so do it after the
-                * CS_PARTIAL_FLUSH before (VS/PS partial flushes are always
-                * implied).
-                */
-               uint64_t va;
-
-               /* Do the flush (enqueue the event and wait for it). */
-               va = ctx->wait_mem_scratch->gpu_address;
-               ctx->wait_mem_number++;
-
-               /* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. */
-               unsigned glm_wb = G_586_GLM_WB(gcr_cntl);
-               unsigned glm_inv = G_586_GLM_INV(gcr_cntl);
-               unsigned glv_inv = G_586_GLV_INV(gcr_cntl);
-               unsigned gl1_inv = G_586_GL1_INV(gcr_cntl);
-               assert(G_586_GL2_US(gcr_cntl) == 0);
-               assert(G_586_GL2_RANGE(gcr_cntl) == 0);
-               assert(G_586_GL2_DISCARD(gcr_cntl) == 0);
-               unsigned gl2_inv = G_586_GL2_INV(gcr_cntl);
-               unsigned gl2_wb = G_586_GL2_WB(gcr_cntl);
-               unsigned gcr_seq = G_586_SEQ(gcr_cntl);
-
-               gcr_cntl &= C_586_GLM_WB &
-                           C_586_GLM_INV &
-                           C_586_GLV_INV &
-                           C_586_GL1_INV &
-                           C_586_GL2_INV &
-                           C_586_GL2_WB; /* keep SEQ */
-
-               si_cp_release_mem(ctx, cs, cb_db_event,
-                                 S_490_GLM_WB(glm_wb) |
-                                 S_490_GLM_INV(glm_inv) |
-                                 S_490_GLV_INV(glv_inv) |
-                                 S_490_GL1_INV(gl1_inv) |
-                                 S_490_GL2_INV(gl2_inv) |
-                                 S_490_GL2_WB(gl2_wb) |
-                                 S_490_SEQ(gcr_seq),
-                                 EOP_DST_SEL_MEM,
-                                 EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
-                                 EOP_DATA_SEL_VALUE_32BIT,
-                                 ctx->wait_mem_scratch, va,
-                                 ctx->wait_mem_number, SI_NOT_QUERY);
-               si_cp_wait_mem(ctx, ctx->gfx_cs, va, ctx->wait_mem_number, 0xffffffff,
-                              WAIT_REG_MEM_EQUAL);
-       }
-
-       /* Ignore fields that only modify the behavior of other fields. */
-       if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) {
-               /* Flush caches and wait for the caches to assert idle.
-                * The cache flush is executed in the ME, but the PFP waits
-                * for completion.
-                */
-               radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
-               radeon_emit(cs, 0);             /* CP_COHER_CNTL */
-               radeon_emit(cs, 0xffffffff);    /* CP_COHER_SIZE */
-               radeon_emit(cs, 0xffffff);      /* CP_COHER_SIZE_HI */
-               radeon_emit(cs, 0);             /* CP_COHER_BASE */
-               radeon_emit(cs, 0);             /* CP_COHER_BASE_HI */
-               radeon_emit(cs, 0x0000000A);    /* POLL_INTERVAL */
-               radeon_emit(cs, gcr_cntl);      /* GCR_CNTL */
-       } else if (cb_db_event ||
-                  (flags & (SI_CONTEXT_VS_PARTIAL_FLUSH |
-                            SI_CONTEXT_PS_PARTIAL_FLUSH |
-                            SI_CONTEXT_CS_PARTIAL_FLUSH))) {
-               /* We need to ensure that PFP waits as well. */
-               radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
-               radeon_emit(cs, 0);
-       }
-
-       if (flags & SI_CONTEXT_START_PIPELINE_STATS) {
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-               radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) |
-                               EVENT_INDEX(0));
-       } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) {
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-               radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) |
-                               EVENT_INDEX(0));
-       }
-
-       ctx->flags = 0;
+   struct radeon_cmdbuf *cs = ctx->gfx_cs;
+   uint32_t gcr_cntl = 0;
+   unsigned cb_db_event = 0;
+   unsigned flags = ctx->flags;
+
+   if (!ctx->has_graphics) {
+      /* Only process compute flags. */
+      flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
+               SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA |
+               SI_CONTEXT_CS_PARTIAL_FLUSH;
+   }
+
+   /* We don't need these. */
+   assert(!(flags & (SI_CONTEXT_VGT_STREAMOUT_SYNC | SI_CONTEXT_FLUSH_AND_INV_DB_META)));
+
+   if (flags & SI_CONTEXT_VGT_FLUSH) {
+      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+      radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
+   }
+
+   if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
+      ctx->num_cb_cache_flushes++;
+   if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
+      ctx->num_db_cache_flushes++;
+
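+   /* GCR_CNTL cache targets: GLI = instruction cache, GLK = scalar (constant)
+    * cache, GLV = vector L0 cache, GL1 = the intermediate GL1 cache,
+    * GL2 = the L2 cache, GLM = L2 metadata.
+    */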
+   if (flags & SI_CONTEXT_INV_ICACHE)
+      gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL);
+   if (flags & SI_CONTEXT_INV_SCACHE) {
+      /* TODO: When writing to the SMEM L1 cache, we need to set SEQ
+       * to FORWARD when both L1 and L2 are written out (WB or INV).
+       */
+      gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1);
+   }
+   if (flags & SI_CONTEXT_INV_VCACHE)
+      gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1);
+
+   /* The L2 cache ops are:
+    * - INV: - invalidate lines that reflect memory (were loaded from memory)
+    *        - don't touch lines that were overwritten (were stored by gfx clients)
+    * - WB: - don't touch lines that reflect memory
+    *       - write back lines that were overwritten
+    * - WB | INV: - invalidate lines that reflect memory
+    *             - write back lines that were overwritten
+    *
+    * GLM doesn't support WB alone. If WB is set, INV must be set too.
+    */
+   if (flags & SI_CONTEXT_INV_L2) {
+      /* Writeback and invalidate everything in L2. */
+      gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) | S_586_GLM_WB(1);
+      ctx->num_L2_invalidates++;
+   } else if (flags & SI_CONTEXT_WB_L2) {
+      gcr_cntl |= S_586_GL2_WB(1) | S_586_GLM_WB(1) | S_586_GLM_INV(1);
+   } else if (flags & SI_CONTEXT_INV_L2_METADATA) {
+      gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1);
+   }
+
+   if (flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
+      if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
+         /* Flush CMASK/FMASK/DCC. Will wait for idle later. */
+         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+         radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
+      }
+      if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
+         /* Flush HTILE. Will wait for idle later. */
+         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+         radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
+      }
+
+      /* First flush CB/DB, then L1/L2. */
+      gcr_cntl |= S_586_SEQ(V_586_SEQ_FORWARD);
+
+      if ((flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) ==
+          (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
+         cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
+      } else if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
+         cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
+      } else if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
+         cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
+      } else {
+         assert(0);
+      }
+   } else {
+      /* Wait for graphics shaders to go idle if requested. */
+      if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
+         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+         radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+         /* Only count explicit shader flushes, not implicit ones. */
+         ctx->num_vs_flushes++;
+         ctx->num_ps_flushes++;
+      } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
+         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+         radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+         ctx->num_vs_flushes++;
+      }
+   }
+
+   if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && ctx->compute_is_busy) {
+      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+      radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+      ctx->num_cs_flushes++;
+      ctx->compute_is_busy = false;
+   }
+
+   if (cb_db_event) {
+      /* CB/DB flush and invalidate (or possibly just a wait for a
+       * meta flush) via RELEASE_MEM.
+       *
+       * Combine this with other cache flushes when possible; this
+       * requires affected shaders to be idle, so do it after the
+       * CS_PARTIAL_FLUSH above (VS/PS partial flushes are always
+       * implied).
+       */
+      uint64_t va;
+
+      /* Do the flush (enqueue the event and wait for it). */
+      va = ctx->wait_mem_scratch->gpu_address;
+      ctx->wait_mem_number++;
+
+      /* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. */
+      unsigned glm_wb = G_586_GLM_WB(gcr_cntl);
+      unsigned glm_inv = G_586_GLM_INV(gcr_cntl);
+      unsigned glv_inv = G_586_GLV_INV(gcr_cntl);
+      unsigned gl1_inv = G_586_GL1_INV(gcr_cntl);
+      assert(G_586_GL2_US(gcr_cntl) == 0);
+      assert(G_586_GL2_RANGE(gcr_cntl) == 0);
+      assert(G_586_GL2_DISCARD(gcr_cntl) == 0);
+      unsigned gl2_inv = G_586_GL2_INV(gcr_cntl);
+      unsigned gl2_wb = G_586_GL2_WB(gcr_cntl);
+      unsigned gcr_seq = G_586_SEQ(gcr_cntl);
+
+      gcr_cntl &= C_586_GLM_WB & C_586_GLM_INV & C_586_GLV_INV & C_586_GL1_INV & C_586_GL2_INV &
+                  C_586_GL2_WB; /* keep SEQ */
+
+      si_cp_release_mem(ctx, cs, cb_db_event,
+                        S_490_GLM_WB(glm_wb) | S_490_GLM_INV(glm_inv) | S_490_GLV_INV(glv_inv) |
+                           S_490_GL1_INV(gl1_inv) | S_490_GL2_INV(gl2_inv) | S_490_GL2_WB(gl2_wb) |
+                           S_490_SEQ(gcr_seq),
+                        EOP_DST_SEL_MEM, EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
+                        EOP_DATA_SEL_VALUE_32BIT, ctx->wait_mem_scratch, va, ctx->wait_mem_number,
+                        SI_NOT_QUERY);
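+      /* Wait until the fence value written by RELEASE_MEM lands in memory,
+       * which guarantees the event and the requested cache operations above
+       * have completed.
+       */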
+      si_cp_wait_mem(ctx, ctx->gfx_cs, va, ctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);
+   }
+
+   /* Ignore fields that only modify the behavior of other fields. */
+   if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) {
+      /* Flush caches and wait for the caches to assert idle.
+       * The cache flush is executed in the ME, but the PFP waits
+       * for completion.
+       */
+      radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
+      radeon_emit(cs, 0);          /* CP_COHER_CNTL */
+      radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
+      radeon_emit(cs, 0xffffff);   /* CP_COHER_SIZE_HI */
+      radeon_emit(cs, 0);          /* CP_COHER_BASE */
+      radeon_emit(cs, 0);          /* CP_COHER_BASE_HI */
+      radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
+      radeon_emit(cs, gcr_cntl);   /* GCR_CNTL */
+   } else if (cb_db_event || (flags & (SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH |
+                                       SI_CONTEXT_CS_PARTIAL_FLUSH))) {
+      /* We need to ensure that PFP waits as well. */
+      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+      radeon_emit(cs, 0);
+   }
+
+   if (flags & SI_CONTEXT_START_PIPELINE_STATS) {
+      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+      radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0));
+   } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) {
+      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+      radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0));
+   }
+
+   ctx->flags = 0;
 }
 
 void si_emit_cache_flush(struct si_context *sctx)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       uint32_t flags = sctx->flags;
-
-       if (!sctx->has_graphics) {
-               /* Only process compute flags. */
-               flags &= SI_CONTEXT_INV_ICACHE |
-                        SI_CONTEXT_INV_SCACHE |
-                        SI_CONTEXT_INV_VCACHE |
-                        SI_CONTEXT_INV_L2 |
-                        SI_CONTEXT_WB_L2 |
-                        SI_CONTEXT_INV_L2_METADATA |
-                        SI_CONTEXT_CS_PARTIAL_FLUSH;
-       }
-
-       uint32_t cp_coher_cntl = 0;
-       const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
-                                             SI_CONTEXT_FLUSH_AND_INV_DB);
-       const bool is_barrier = flush_cb_db ||
-                               /* INV_ICACHE == beginning of gfx IB. Checking
-                                * INV_ICACHE fixes corruption for DeusExMD with
-                                * compute-based culling, but I don't know why.
-                                */
-                               flags & (SI_CONTEXT_INV_ICACHE |
-                                        SI_CONTEXT_PS_PARTIAL_FLUSH |
-                                        SI_CONTEXT_VS_PARTIAL_FLUSH) ||
-                               (flags & SI_CONTEXT_CS_PARTIAL_FLUSH &&
-                                sctx->compute_is_busy);
-
-       assert(sctx->chip_class <= GFX9);
-
-       if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
-               sctx->num_cb_cache_flushes++;
-       if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
-               sctx->num_db_cache_flushes++;
-
-       /* GFX6 has a bug that it always flushes ICACHE and KCACHE if either
-        * bit is set. An alternative way is to write SQC_CACHES, but that
-        * doesn't seem to work reliably. Since the bug doesn't affect
-        * correctness (it only does more work than necessary) and
-        * the performance impact is likely negligible, there is no plan
-        * to add a workaround for it.
-        */
-
-       if (flags & SI_CONTEXT_INV_ICACHE)
-               cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
-       if (flags & SI_CONTEXT_INV_SCACHE)
-               cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
-
-       if (sctx->chip_class <= GFX8) {
-               if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
-                       cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
-                                        S_0085F0_CB0_DEST_BASE_ENA(1) |
-                                        S_0085F0_CB1_DEST_BASE_ENA(1) |
-                                        S_0085F0_CB2_DEST_BASE_ENA(1) |
-                                        S_0085F0_CB3_DEST_BASE_ENA(1) |
-                                        S_0085F0_CB4_DEST_BASE_ENA(1) |
-                                        S_0085F0_CB5_DEST_BASE_ENA(1) |
-                                        S_0085F0_CB6_DEST_BASE_ENA(1) |
-                                        S_0085F0_CB7_DEST_BASE_ENA(1);
-
-                       /* Necessary for DCC */
-                       if (sctx->chip_class == GFX8)
-                               si_cp_release_mem(sctx, cs,
-                                                 V_028A90_FLUSH_AND_INV_CB_DATA_TS,
-                                                 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
-                                                 EOP_DATA_SEL_DISCARD, NULL,
-                                                 0, 0, SI_NOT_QUERY);
-               }
-               if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
-                       cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
-                                        S_0085F0_DB_DEST_BASE_ENA(1);
-       }
-
-       if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
-               /* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. */
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-               radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
-       }
-       if (flags & (SI_CONTEXT_FLUSH_AND_INV_DB |
-                    SI_CONTEXT_FLUSH_AND_INV_DB_META)) {
-               /* Flush HTILE. SURFACE_SYNC will wait for idle. */
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-               radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
-       }
-
-       /* Wait for shader engines to go idle.
-        * VS and PS waits are unnecessary if SURFACE_SYNC is going to wait
-        * for everything including CB/DB cache flushes.
-        */
-       if (!flush_cb_db) {
-               if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
-                       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-                       radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
-                       /* Only count explicit shader flushes, not implicit ones
-                        * done by SURFACE_SYNC.
-                        */
-                       sctx->num_vs_flushes++;
-                       sctx->num_ps_flushes++;
-               } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
-                       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-                       radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
-                       sctx->num_vs_flushes++;
-               }
-       }
-
-       if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH &&
-           sctx->compute_is_busy) {
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-               radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
-               sctx->num_cs_flushes++;
-               sctx->compute_is_busy = false;
-       }
-
-       /* VGT state synchronization. */
-       if (flags & SI_CONTEXT_VGT_FLUSH) {
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-               radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
-       }
-       if (flags & SI_CONTEXT_VGT_STREAMOUT_SYNC) {
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-               radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0));
-       }
-
-       /* GFX9: Wait for idle if we're flushing CB or DB. ACQUIRE_MEM doesn't
-        * wait for idle on GFX9. We have to use a TS event.
-        */
-       if (sctx->chip_class == GFX9 && flush_cb_db) {
-               uint64_t va;
-               unsigned tc_flags, cb_db_event;
-
-               /* Set the CB/DB flush event. */
-               switch (flush_cb_db) {
-               case SI_CONTEXT_FLUSH_AND_INV_CB:
-                       cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
-                       break;
-               case SI_CONTEXT_FLUSH_AND_INV_DB:
-                       cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
-                       break;
-               default:
-                       /* both CB & DB */
-                       cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
-               }
-
-               /* These are the only allowed combinations. If you need to
-                * do multiple operations at once, do them separately.
-                * All operations that invalidate L2 also seem to invalidate
-                * metadata. Volatile (VOL) and WC flushes are not listed here.
-                *
-                * TC    | TC_WB         = writeback & invalidate L2 & L1
-                * TC    | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC
-                *         TC_WB | TC_NC = writeback L2 for MTYPE == NC
-                * TC            | TC_NC = invalidate L2 for MTYPE == NC
-                * TC    | TC_MD         = writeback & invalidate L2 metadata (DCC, etc.)
-                * TCL1                  = invalidate L1
-                */
-               tc_flags = 0;
-
-               if (flags & SI_CONTEXT_INV_L2_METADATA) {
-                       tc_flags = EVENT_TC_ACTION_ENA |
-                                  EVENT_TC_MD_ACTION_ENA;
-               }
-
-               /* Ideally flush TC together with CB/DB. */
-               if (flags & SI_CONTEXT_INV_L2) {
-                       /* Writeback and invalidate everything in L2 & L1. */
-                       tc_flags = EVENT_TC_ACTION_ENA |
-                                  EVENT_TC_WB_ACTION_ENA;
-
-                       /* Clear the flags. */
-                       flags &= ~(SI_CONTEXT_INV_L2 |
-                                  SI_CONTEXT_WB_L2 |
-                                  SI_CONTEXT_INV_VCACHE);
-                       sctx->num_L2_invalidates++;
-               }
-
-               /* Do the flush (enqueue the event and wait for it). */
-               va = sctx->wait_mem_scratch->gpu_address;
-               sctx->wait_mem_number++;
-
-               si_cp_release_mem(sctx, cs, cb_db_event, tc_flags,
-                                 EOP_DST_SEL_MEM,
-                                 EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
-                                 EOP_DATA_SEL_VALUE_32BIT,
-                                 sctx->wait_mem_scratch, va,
-                                 sctx->wait_mem_number, SI_NOT_QUERY);
-               si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff,
-                              WAIT_REG_MEM_EQUAL);
-       }
-
-       /* Make sure ME is idle (it executes most packets) before continuing.
-        * This prevents read-after-write hazards between PFP and ME.
-        */
-       if (sctx->has_graphics &&
-           (cp_coher_cntl ||
-            (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH |
-                      SI_CONTEXT_INV_VCACHE |
-                      SI_CONTEXT_INV_L2 |
-                      SI_CONTEXT_WB_L2)))) {
-               radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
-               radeon_emit(cs, 0);
-       }
-
-       /* GFX6-GFX8 only:
-        *   When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
-        *   waits for idle, so it should be last. SURFACE_SYNC is done in PFP.
-        *
-        * cp_coher_cntl should contain all necessary flags except TC flags
-        * at this point.
-        *
-        * GFX6-GFX7 don't support L2 write-back.
-        */
-       if (flags & SI_CONTEXT_INV_L2 ||
-           (sctx->chip_class <= GFX7 &&
-            (flags & SI_CONTEXT_WB_L2))) {
-               /* Invalidate L1 & L2. (L1 is always invalidated on GFX6)
-                * WB must be set on GFX8+ when TC_ACTION is set.
-                */
-               si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl |
-                                    S_0085F0_TC_ACTION_ENA(1) |
-                                    S_0085F0_TCL1_ACTION_ENA(1) |
-                                    S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8));
-               cp_coher_cntl = 0;
-               sctx->num_L2_invalidates++;
-       } else {
-               /* L1 invalidation and L2 writeback must be done separately,
-                * because both operations can't be done together.
-                */
-               if (flags & SI_CONTEXT_WB_L2) {
-                       /* WB = write-back
-                        * NC = apply to non-coherent MTYPEs
-                        *      (i.e. MTYPE <= 1, which is what we use everywhere)
-                        *
-                        * WB doesn't work without NC.
-                        */
-                       si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl |
-                                            S_0301F0_TC_WB_ACTION_ENA(1) |
-                                            S_0301F0_TC_NC_ACTION_ENA(1));
-                       cp_coher_cntl = 0;
-                       sctx->num_L2_writebacks++;
-               }
-               if (flags & SI_CONTEXT_INV_VCACHE) {
-                       /* Invalidate per-CU VMEM L1. */
-                       si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl |
-                                            S_0085F0_TCL1_ACTION_ENA(1));
-                       cp_coher_cntl = 0;
-               }
-       }
-
-       /* If TC flushes haven't cleared this... */
-       if (cp_coher_cntl)
-               si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl);
-
-       if (is_barrier)
-               si_prim_discard_signal_next_compute_ib_start(sctx);
-
-       if (flags & SI_CONTEXT_START_PIPELINE_STATS) {
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-               radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) |
-                               EVENT_INDEX(0));
-       } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) {
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-               radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) |
-                               EVENT_INDEX(0));
-       }
-
-       sctx->flags = 0;
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   uint32_t flags = sctx->flags;
+
+   if (!sctx->has_graphics) {
+      /* Only process compute flags. */
+      flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
+               SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA |
+               SI_CONTEXT_CS_PARTIAL_FLUSH;
+   }
+
+   uint32_t cp_coher_cntl = 0;
+   const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB);
+   const bool is_barrier =
+      flush_cb_db ||
+      /* INV_ICACHE == beginning of gfx IB. Checking
+       * INV_ICACHE fixes corruption for DeusExMD with
+       * compute-based culling, but I don't know why.
+       */
+      flags & (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_VS_PARTIAL_FLUSH) ||
+      (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy);
+
+   assert(sctx->chip_class <= GFX9);
+
+   if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
+      sctx->num_cb_cache_flushes++;
+   if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
+      sctx->num_db_cache_flushes++;
+
+   /* GFX6 has a bug where it always flushes ICACHE and KCACHE if either
+    * bit is set. An alternative way is to write SQC_CACHES, but that
+    * doesn't seem to work reliably. Since the bug doesn't affect
+    * correctness (it only does more work than necessary) and
+    * the performance impact is likely negligible, there is no plan
+    * to add a workaround for it.
+    */
+
+   if (flags & SI_CONTEXT_INV_ICACHE)
+      cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
+   if (flags & SI_CONTEXT_INV_SCACHE)
+      cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
+
+   if (sctx->chip_class <= GFX8) {
+      if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
+         cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | S_0085F0_CB0_DEST_BASE_ENA(1) |
+                          S_0085F0_CB1_DEST_BASE_ENA(1) | S_0085F0_CB2_DEST_BASE_ENA(1) |
+                          S_0085F0_CB3_DEST_BASE_ENA(1) | S_0085F0_CB4_DEST_BASE_ENA(1) |
+                          S_0085F0_CB5_DEST_BASE_ENA(1) | S_0085F0_CB6_DEST_BASE_ENA(1) |
+                          S_0085F0_CB7_DEST_BASE_ENA(1);
+
+         /* Necessary for DCC */
+         if (sctx->chip_class == GFX8)
+            si_cp_release_mem(sctx, cs, V_028A90_FLUSH_AND_INV_CB_DATA_TS, 0, EOP_DST_SEL_MEM,
+                              EOP_INT_SEL_NONE, EOP_DATA_SEL_DISCARD, NULL, 0, 0, SI_NOT_QUERY);
+      }
+      if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
+         cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1);
+   }
+
+   if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
+      /* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. */
+      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+      radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
+   }
+   if (flags & (SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_FLUSH_AND_INV_DB_META)) {
+      /* Flush HTILE. SURFACE_SYNC will wait for idle. */
+      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+      radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
+   }
+
+   /* Wait for shader engines to go idle.
+    * VS and PS waits are unnecessary if SURFACE_SYNC is going to wait
+    * for everything including CB/DB cache flushes.
+    */
+   if (!flush_cb_db) {
+      if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
+         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+         radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+         /* Only count explicit shader flushes, not implicit ones
+          * done by SURFACE_SYNC.
+          */
+         sctx->num_vs_flushes++;
+         sctx->num_ps_flushes++;
+      } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
+         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+         radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+         sctx->num_vs_flushes++;
+      }
+   }
+
+   if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy) {
+      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+      radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+      sctx->num_cs_flushes++;
+      sctx->compute_is_busy = false;
+   }
+
+   /* VGT state synchronization. */
+   if (flags & SI_CONTEXT_VGT_FLUSH) {
+      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+      radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
+   }
+   if (flags & SI_CONTEXT_VGT_STREAMOUT_SYNC) {
+      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+      radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0));
+   }
+
+   /* GFX9: Wait for idle if we're flushing CB or DB. ACQUIRE_MEM doesn't
+    * wait for idle on GFX9. We have to use a TS event.
+    */
+   if (sctx->chip_class == GFX9 && flush_cb_db) {
+      uint64_t va;
+      unsigned tc_flags, cb_db_event;
+
+      /* Set the CB/DB flush event. */
+      switch (flush_cb_db) {
+      case SI_CONTEXT_FLUSH_AND_INV_CB:
+         cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
+         break;
+      case SI_CONTEXT_FLUSH_AND_INV_DB:
+         cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
+         break;
+      default:
+         /* both CB & DB */
+         cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
+      }
+
+      /* These are the only allowed combinations. If you need to
+       * do multiple operations at once, do them separately.
+       * All operations that invalidate L2 also seem to invalidate
+       * metadata. Volatile (VOL) and WC flushes are not listed here.
+       *
+       * TC    | TC_WB         = writeback & invalidate L2 & L1
+       * TC    | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC
+       *         TC_WB | TC_NC = writeback L2 for MTYPE == NC
+       * TC            | TC_NC = invalidate L2 for MTYPE == NC
+       * TC    | TC_MD         = writeback & invalidate L2 metadata (DCC, etc.)
+       * TCL1                  = invalidate L1
+       */
+      tc_flags = 0;
+
+      if (flags & SI_CONTEXT_INV_L2_METADATA) {
+         tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_MD_ACTION_ENA;
+      }
+
+      /* Ideally flush TC together with CB/DB. */
+      if (flags & SI_CONTEXT_INV_L2) {
+         /* Writeback and invalidate everything in L2 & L1. */
+         tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_WB_ACTION_ENA;
+
+         /* Clear the flags. */
+         flags &= ~(SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_VCACHE);
+         sctx->num_L2_invalidates++;
+      }
+
+      /* Do the flush (enqueue the event and wait for it). */
+      va = sctx->wait_mem_scratch->gpu_address;
+      sctx->wait_mem_number++;
+
+      si_cp_release_mem(sctx, cs, cb_db_event, tc_flags, EOP_DST_SEL_MEM,
+                        EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_VALUE_32BIT,
+                        sctx->wait_mem_scratch, va, sctx->wait_mem_number, SI_NOT_QUERY);
+      si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);
+   }
+
+   /* Make sure ME is idle (it executes most packets) before continuing.
+    * This prevents read-after-write hazards between PFP and ME.
+    */
+   if (sctx->has_graphics &&
+       (cp_coher_cntl || (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_INV_VCACHE |
+                                   SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2)))) {
+      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+      radeon_emit(cs, 0);
+   }
+
+   /* GFX6-GFX8 only:
+    *   When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
+    *   waits for idle, so it should be last. SURFACE_SYNC is done in PFP.
+    *
+    * cp_coher_cntl should contain all necessary flags except TC flags
+    * at this point.
+    *
+    * GFX6-GFX7 don't support L2 write-back.
+    */
+   if (flags & SI_CONTEXT_INV_L2 || (sctx->chip_class <= GFX7 && (flags & SI_CONTEXT_WB_L2))) {
+      /* Invalidate L1 & L2. (L1 is always invalidated on GFX6)
+       * WB must be set on GFX8+ when TC_ACTION is set.
+       */
+      si_emit_surface_sync(sctx, sctx->gfx_cs,
+                           cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
+                              S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8));
+      cp_coher_cntl = 0;
+      sctx->num_L2_invalidates++;
+   } else {
+      /* L1 invalidation and L2 writeback must be done separately,
+       * because both operations can't be done together.
+       */
+      if (flags & SI_CONTEXT_WB_L2) {
+         /* WB = write-back
+          * NC = apply to non-coherent MTYPEs
+          *      (i.e. MTYPE <= 1, which is what we use everywhere)
+          *
+          * WB doesn't work without NC.
+          */
+         si_emit_surface_sync(
+            sctx, sctx->gfx_cs,
+            cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1));
+         cp_coher_cntl = 0;
+         sctx->num_L2_writebacks++;
+      }
+      if (flags & SI_CONTEXT_INV_VCACHE) {
+         /* Invalidate per-CU VMEM L1. */
+         si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl | S_0085F0_TCL1_ACTION_ENA(1));
+         cp_coher_cntl = 0;
+      }
+   }
+
+   /* If TC flushes haven't cleared this... */
+   if (cp_coher_cntl)
+      si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl);
+
+   if (is_barrier)
+      si_prim_discard_signal_next_compute_ib_start(sctx);
+
+   if (flags & SI_CONTEXT_START_PIPELINE_STATS) {
+      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+      radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0));
+   } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) {
+      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+      radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0));
+   }
+
+   sctx->flags = 0;
 }
 
-static void si_get_draw_start_count(struct si_context *sctx,
-                                   const struct pipe_draw_info *info,
-                                   unsigned *start, unsigned *count)
+static void si_get_draw_start_count(struct si_context *sctx, const struct pipe_draw_info *info,
+                                    unsigned *start, unsigned *count)
 {
-       struct pipe_draw_indirect_info *indirect = info->indirect;
-
-       if (indirect) {
-               unsigned indirect_count;
-               struct pipe_transfer *transfer;
-               unsigned begin, end;
-               unsigned map_size;
-               unsigned *data;
-
-               if (indirect->indirect_draw_count) {
-                       data = pipe_buffer_map_range(&sctx->b,
-                                       indirect->indirect_draw_count,
-                                       indirect->indirect_draw_count_offset,
-                                       sizeof(unsigned),
-                                       PIPE_TRANSFER_READ, &transfer);
-
-                       indirect_count = *data;
-
-                       pipe_buffer_unmap(&sctx->b, transfer);
-               } else {
-                       indirect_count = indirect->draw_count;
-               }
-
-               if (!indirect_count) {
-                       *start = *count = 0;
-                       return;
-               }
-
-               map_size = (indirect_count - 1) * indirect->stride + 3 * sizeof(unsigned);
-               data = pipe_buffer_map_range(&sctx->b, indirect->buffer,
-                                            indirect->offset, map_size,
-                                            PIPE_TRANSFER_READ, &transfer);
-
-               begin = UINT_MAX;
-               end = 0;
-
-               for (unsigned i = 0; i < indirect_count; ++i) {
-                       unsigned count = data[0];
-                       unsigned start = data[2];
-
-                       if (count > 0) {
-                               begin = MIN2(begin, start);
-                               end = MAX2(end, start + count);
-                       }
-
-                       data += indirect->stride / sizeof(unsigned);
-               }
-
-               pipe_buffer_unmap(&sctx->b, transfer);
-
-               if (begin < end) {
-                       *start = begin;
-                       *count = end - begin;
-               } else {
-                       *start = *count = 0;
-               }
-       } else {
-               *start = info->start;
-               *count = info->count;
-       }
+   struct pipe_draw_indirect_info *indirect = info->indirect;
+
+   if (indirect) {
+      unsigned indirect_count;
+      struct pipe_transfer *transfer;
+      unsigned begin, end;
+      unsigned map_size;
+      unsigned *data;
+
+      if (indirect->indirect_draw_count) {
+         data = pipe_buffer_map_range(&sctx->b, indirect->indirect_draw_count,
+                                      indirect->indirect_draw_count_offset, sizeof(unsigned),
+                                      PIPE_TRANSFER_READ, &transfer);
+
+         indirect_count = *data;
+
+         pipe_buffer_unmap(&sctx->b, transfer);
+      } else {
+         indirect_count = indirect->draw_count;
+      }
+
+      if (!indirect_count) {
+         *start = *count = 0;
+         return;
+      }
+
+      map_size = (indirect_count - 1) * indirect->stride + 3 * sizeof(unsigned);
+      data = pipe_buffer_map_range(&sctx->b, indirect->buffer, indirect->offset, map_size,
+                                   PIPE_TRANSFER_READ, &transfer);
+
+      begin = UINT_MAX;
+      end = 0;
+
+      for (unsigned i = 0; i < indirect_count; ++i) {
+         unsigned count = data[0];
+         unsigned start = data[2];
+
+         if (count > 0) {
+            begin = MIN2(begin, start);
+            end = MAX2(end, start + count);
+         }
+
+         data += indirect->stride / sizeof(unsigned);
+      }
+
+      pipe_buffer_unmap(&sctx->b, transfer);
+
+      if (begin < end) {
+         *start = begin;
+         *count = end - begin;
+      } else {
+         *start = *count = 0;
+      }
+   } else {
+      *start = info->start;
+      *count = info->count;
+   }
 }
 
 static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,
-                              enum pipe_prim_type prim, unsigned instance_count,
-                              bool primitive_restart, unsigned skip_atom_mask)
+                               enum pipe_prim_type prim, unsigned instance_count,
+                               bool primitive_restart, unsigned skip_atom_mask)
 {
-       unsigned num_patches = 0;
+   unsigned num_patches = 0;
 
-       si_emit_rasterizer_prim_state(sctx);
-       if (sctx->tes_shader.cso)
-               si_emit_derived_tess_state(sctx, info, &num_patches);
+   si_emit_rasterizer_prim_state(sctx);
+   if (sctx->tes_shader.cso)
+      si_emit_derived_tess_state(sctx, info, &num_patches);
 
-       /* Emit state atoms. */
-       unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
-       while (mask)
-               sctx->atoms.array[u_bit_scan(&mask)].emit(sctx);
+   /* Emit state atoms. */
+   unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
+   while (mask)
+      sctx->atoms.array[u_bit_scan(&mask)].emit(sctx);
 
-       sctx->dirty_atoms &= skip_atom_mask;
+   sctx->dirty_atoms &= skip_atom_mask;
 
-       /* Emit states. */
-       mask = sctx->dirty_states;
-       while (mask) {
-               unsigned i = u_bit_scan(&mask);
-               struct si_pm4_state *state = sctx->queued.array[i];
+   /* Emit states. */
+   mask = sctx->dirty_states;
+   while (mask) {
+      unsigned i = u_bit_scan(&mask);
+      struct si_pm4_state *state = sctx->queued.array[i];
 
-               if (!state || sctx->emitted.array[i] == state)
-                       continue;
+      if (!state || sctx->emitted.array[i] == state)
+         continue;
 
-               si_pm4_emit(sctx, state);
-               sctx->emitted.array[i] = state;
-       }
-       sctx->dirty_states = 0;
+      si_pm4_emit(sctx, state);
+      sctx->emitted.array[i] = state;
+   }
+   sctx->dirty_states = 0;
 
-       /* Emit draw states. */
-       si_emit_vs_state(sctx, info);
-       si_emit_draw_registers(sctx, info, prim, num_patches, instance_count,
-                              primitive_restart);
+   /* Emit draw states. */
+   si_emit_vs_state(sctx, info);
+   si_emit_draw_registers(sctx, info, prim, num_patches, instance_count, primitive_restart);
 }
 
-static bool
-si_all_vs_resources_read_only(struct si_context *sctx,
-                             struct pipe_resource *indexbuf)
+static bool si_all_vs_resources_read_only(struct si_context *sctx, struct pipe_resource *indexbuf)
 {
-       struct radeon_winsys *ws = sctx->ws;
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-
-       /* Index buffer. */
-       if (indexbuf &&
-           ws->cs_is_buffer_referenced(cs, si_resource(indexbuf)->buf,
-                                       RADEON_USAGE_WRITE))
-               goto has_write_reference;
-
-       /* Vertex buffers. */
-       struct si_vertex_elements *velems = sctx->vertex_elements;
-       unsigned num_velems = velems->count;
-
-       for (unsigned i = 0; i < num_velems; i++) {
-               if (!((1 << i) & velems->first_vb_use_mask))
-                       continue;
-
-               unsigned vb_index = velems->vertex_buffer_index[i];
-               struct pipe_resource *res = sctx->vertex_buffer[vb_index].buffer.resource;
-               if (!res)
-                       continue;
-
-               if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf,
-                                               RADEON_USAGE_WRITE))
-                       goto has_write_reference;
-       }
-
-       /* Constant and shader buffers. */
-       struct si_descriptors *buffers =
-               &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX)];
-       for (unsigned i = 0; i < buffers->num_active_slots; i++) {
-               unsigned index = buffers->first_active_slot + i;
-               struct pipe_resource *res =
-                       sctx->const_and_shader_buffers[PIPE_SHADER_VERTEX].buffers[index];
-               if (!res)
-                       continue;
-
-               if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf,
-                                               RADEON_USAGE_WRITE))
-                       goto has_write_reference;
-       }
-
-       /* Samplers. */
-       struct si_shader_selector *vs = sctx->vs_shader.cso;
-       if (vs->info.samplers_declared) {
-               unsigned num_samplers = util_last_bit(vs->info.samplers_declared);
-
-               for (unsigned i = 0; i < num_samplers; i++) {
-                       struct pipe_sampler_view *view = sctx->samplers[PIPE_SHADER_VERTEX].views[i];
-                       if (!view)
-                               continue;
-
-                       if (ws->cs_is_buffer_referenced(cs,
-                                                       si_resource(view->texture)->buf,
-                                                       RADEON_USAGE_WRITE))
-                               goto has_write_reference;
-               }
-       }
-
-       /* Images. */
-       if (vs->info.images_declared) {
-               unsigned num_images = util_last_bit(vs->info.images_declared);
-
-               for (unsigned i = 0; i < num_images; i++) {
-                       struct pipe_resource *res = sctx->images[PIPE_SHADER_VERTEX].views[i].resource;
-                       if (!res)
-                               continue;
-
-                       if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf,
-                                                       RADEON_USAGE_WRITE))
-                               goto has_write_reference;
-               }
-       }
-
-       return true;
+   struct radeon_winsys *ws = sctx->ws;
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+   /* Index buffer. */
+   if (indexbuf && ws->cs_is_buffer_referenced(cs, si_resource(indexbuf)->buf, RADEON_USAGE_WRITE))
+      goto has_write_reference;
+
+   /* Vertex buffers. */
+   struct si_vertex_elements *velems = sctx->vertex_elements;
+   unsigned num_velems = velems->count;
+
+   for (unsigned i = 0; i < num_velems; i++) {
+      if (!((1 << i) & velems->first_vb_use_mask))
+         continue;
+
+      unsigned vb_index = velems->vertex_buffer_index[i];
+      struct pipe_resource *res = sctx->vertex_buffer[vb_index].buffer.resource;
+      if (!res)
+         continue;
+
+      if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE))
+         goto has_write_reference;
+   }
+
+   /* Constant and shader buffers. */
+   struct si_descriptors *buffers =
+      &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX)];
+   for (unsigned i = 0; i < buffers->num_active_slots; i++) {
+      unsigned index = buffers->first_active_slot + i;
+      struct pipe_resource *res = sctx->const_and_shader_buffers[PIPE_SHADER_VERTEX].buffers[index];
+      if (!res)
+         continue;
+
+      if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE))
+         goto has_write_reference;
+   }
+
+   /* Samplers. */
+   struct si_shader_selector *vs = sctx->vs_shader.cso;
+   if (vs->info.samplers_declared) {
+      unsigned num_samplers = util_last_bit(vs->info.samplers_declared);
+
+      for (unsigned i = 0; i < num_samplers; i++) {
+         struct pipe_sampler_view *view = sctx->samplers[PIPE_SHADER_VERTEX].views[i];
+         if (!view)
+            continue;
+
+         if (ws->cs_is_buffer_referenced(cs, si_resource(view->texture)->buf, RADEON_USAGE_WRITE))
+            goto has_write_reference;
+      }
+   }
+
+   /* Images. */
+   if (vs->info.images_declared) {
+      unsigned num_images = util_last_bit(vs->info.images_declared);
+
+      for (unsigned i = 0; i < num_images; i++) {
+         struct pipe_resource *res = sctx->images[PIPE_SHADER_VERTEX].views[i].resource;
+         if (!res)
+            continue;
+
+         if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE))
+            goto has_write_reference;
+      }
+   }
+
+   return true;
 
 has_write_reference:
-       /* If the current gfx IB has enough packets, flush it to remove write
-        * references to buffers.
-        */
-       if (cs->prev_dw + cs->current.cdw > 2048) {
-               si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
-               assert(si_all_vs_resources_read_only(sctx, indexbuf));
-               return true;
-       }
-       return false;
+   /* If the current gfx IB has enough packets, flush it to remove write
+    * references to buffers.
+    */
+   if (cs->prev_dw + cs->current.cdw > 2048) {
+      si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+      assert(si_all_vs_resources_read_only(sctx, indexbuf));
+      return true;
+   }
+   return false;
 }
 
 static ALWAYS_INLINE bool pd_msg(const char *s)
 {
-       if (SI_PRIM_DISCARD_DEBUG)
-               printf("PD failed: %s\n", s);
-       return false;
+   if (SI_PRIM_DISCARD_DEBUG)
+      printf("PD failed: %s\n", s);
+   return false;
 }
 
 static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
-       struct pipe_resource *indexbuf = info->index.resource;
-       unsigned dirty_tex_counter, dirty_buf_counter;
-       enum pipe_prim_type rast_prim, prim = info->mode;
-       unsigned index_size = info->index_size;
-       unsigned index_offset = info->indirect ? info->start * index_size : 0;
-       unsigned instance_count = info->instance_count;
-       bool primitive_restart = info->primitive_restart &&
-                                (!sctx->screen->options.prim_restart_tri_strips_only ||
-                                 (prim != PIPE_PRIM_TRIANGLE_STRIP &&
-                                  prim != PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY));
-
-       if (likely(!info->indirect)) {
-               /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is
-                * no workaround for indirect draws, but we can at least skip
-                * direct draws.
-                */
-               if (unlikely(!instance_count))
-                       return;
-
-               /* Handle count == 0. */
-               if (unlikely(!info->count &&
-                            (index_size || !info->count_from_stream_output)))
-                       return;
-       }
-
-       struct si_shader_selector *vs = sctx->vs_shader.cso;
-       if (unlikely(!vs ||
-                    sctx->num_vertex_elements < vs->num_vs_inputs ||
-                    (!sctx->ps_shader.cso && !rs->rasterizer_discard) ||
-                    (!!sctx->tes_shader.cso != (prim == PIPE_PRIM_PATCHES)))) {
-               assert(0);
-               return;
-       }
-
-       /* Recompute and re-emit the texture resource states if needed. */
-       dirty_tex_counter = p_atomic_read(&sctx->screen->dirty_tex_counter);
-       if (unlikely(dirty_tex_counter != sctx->last_dirty_tex_counter)) {
-               sctx->last_dirty_tex_counter = dirty_tex_counter;
-               sctx->framebuffer.dirty_cbufs |=
-                       ((1 << sctx->framebuffer.state.nr_cbufs) - 1);
-               sctx->framebuffer.dirty_zsbuf = true;
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
-               si_update_all_texture_descriptors(sctx);
-       }
-
-       dirty_buf_counter = p_atomic_read(&sctx->screen->dirty_buf_counter);
-       if (unlikely(dirty_buf_counter != sctx->last_dirty_buf_counter)) {
-               sctx->last_dirty_buf_counter = dirty_buf_counter;
-               /* Rebind all buffers unconditionally. */
-               si_rebind_buffer(sctx, NULL);
-       }
-
-       si_decompress_textures(sctx, u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS));
-
-       /* Set the rasterization primitive type.
-        *
-        * This must be done after si_decompress_textures, which can call
-        * draw_vbo recursively, and before si_update_shaders, which uses
-        * current_rast_prim for this draw_vbo call. */
-       if (sctx->gs_shader.cso) {
-               /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */
-               rast_prim = sctx->gs_shader.cso->rast_prim;
-       } else if (sctx->tes_shader.cso) {
-               /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */
-               rast_prim = sctx->tes_shader.cso->rast_prim;
-       } else if (util_rast_prim_is_triangles(prim)) {
-               rast_prim = PIPE_PRIM_TRIANGLES;
-       } else {
-               /* Only possibilities: POINTS, LINE*, RECTANGLES */
-               rast_prim = prim;
-       }
-
-       if (rast_prim != sctx->current_rast_prim) {
-               if (util_prim_is_points_or_lines(sctx->current_rast_prim) !=
-                   util_prim_is_points_or_lines(rast_prim))
-                       si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband);
-
-               sctx->current_rast_prim = rast_prim;
-               sctx->do_update_shaders = true;
-       }
-
-       if (sctx->tes_shader.cso &&
-           sctx->screen->info.has_ls_vgpr_init_bug) {
-               /* Determine whether the LS VGPR fix should be applied.
-                *
-                * It is only required when num input CPs > num output CPs,
-                * which cannot happen with the fixed function TCS. We should
-                * also update this bit when switching from TCS to fixed
-                * function TCS.
-                */
-               struct si_shader_selector *tcs = sctx->tcs_shader.cso;
-               bool ls_vgpr_fix =
-                       tcs &&
-                       info->vertices_per_patch >
-                       tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
-
-               if (ls_vgpr_fix != sctx->ls_vgpr_fix) {
-                       sctx->ls_vgpr_fix = ls_vgpr_fix;
-                       sctx->do_update_shaders = true;
-               }
-       }
-
-       if (sctx->chip_class <= GFX9 && sctx->gs_shader.cso) {
-               /* Determine whether the GS triangle strip adjacency fix should
-                * be applied. Rotate every other triangle if
-                * - triangle strips with adjacency are fed to the GS and
-                * - primitive restart is disabled (the rotation doesn't help
-                *   when the restart occurs after an odd number of triangles).
-                */
-               bool gs_tri_strip_adj_fix =
-                       !sctx->tes_shader.cso &&
-                       prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY &&
-                       !primitive_restart;
-
-               if (gs_tri_strip_adj_fix != sctx->gs_tri_strip_adj_fix) {
-                       sctx->gs_tri_strip_adj_fix = gs_tri_strip_adj_fix;
-                       sctx->do_update_shaders = true;
-               }
-       }
-
-       if (index_size) {
-               /* Translate or upload, if needed. */
-               /* 8-bit indices are supported on GFX8. */
-               if (sctx->chip_class <= GFX7 && index_size == 1) {
-                       unsigned start, count, start_offset, size, offset;
-                       void *ptr;
-
-                       si_get_draw_start_count(sctx, info, &start, &count);
-                       start_offset = start * 2;
-                       size = count * 2;
-
-                       indexbuf = NULL;
-                       u_upload_alloc(ctx->stream_uploader, start_offset,
-                                      size,
-                                      si_optimal_tcc_alignment(sctx, size),
-                                      &offset, &indexbuf, &ptr);
-                       if (!indexbuf)
-                               return;
-
-                       util_shorten_ubyte_elts_to_userptr(&sctx->b, info, 0, 0,
-                                                          index_offset + start,
-                                                          count, ptr);
-
-                       /* info->start will be added by the drawing code */
-                       index_offset = offset - start_offset;
-                       index_size = 2;
-               } else if (info->has_user_indices) {
-                       unsigned start_offset;
-
-                       assert(!info->indirect);
-                       start_offset = info->start * index_size;
-
-                       indexbuf = NULL;
-                       u_upload_data(ctx->stream_uploader, start_offset,
-                                     info->count * index_size,
-                                     sctx->screen->info.tcc_cache_line_size,
-                                     (char*)info->index.user + start_offset,
-                                     &index_offset, &indexbuf);
-                       if (!indexbuf)
-                               return;
-
-                       /* info->start will be added by the drawing code */
-                       index_offset -= start_offset;
-               } else if (sctx->chip_class <= GFX7 &&
-                          si_resource(indexbuf)->TC_L2_dirty) {
-                       /* GFX8 reads index buffers through TC L2, so it doesn't
-                        * need this. */
-                       sctx->flags |= SI_CONTEXT_WB_L2;
-                       si_resource(indexbuf)->TC_L2_dirty = false;
-               }
-       }
-
-       bool dispatch_prim_discard_cs = false;
-       bool prim_discard_cs_instancing = false;
-       unsigned original_index_size = index_size;
-       unsigned direct_count = 0;
-
-       if (info->indirect) {
-               struct pipe_draw_indirect_info *indirect = info->indirect;
-
-               /* Add the buffer size for memory checking in need_cs_space. */
-               si_context_add_resource_size(sctx, indirect->buffer);
-
-               /* Indirect buffers use TC L2 on GFX9, but not older hw. */
-               if (sctx->chip_class <= GFX8) {
-                       if (si_resource(indirect->buffer)->TC_L2_dirty) {
-                               sctx->flags |= SI_CONTEXT_WB_L2;
-                               si_resource(indirect->buffer)->TC_L2_dirty = false;
-                       }
-
-                       if (indirect->indirect_draw_count &&
-                           si_resource(indirect->indirect_draw_count)->TC_L2_dirty) {
-                               sctx->flags |= SI_CONTEXT_WB_L2;
-                               si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false;
-                       }
-               }
-       } else {
-               /* Multiply by 3 for strips and fans to get an approximate vertex
-                * count as triangles. */
-               direct_count = info->count * instance_count *
-                              (prim == PIPE_PRIM_TRIANGLES ? 1 : 3);
-       }
-
-       /* Determine if we can use the primitive discard compute shader. */
-       if (si_compute_prim_discard_enabled(sctx) &&
-           (direct_count > sctx->prim_discard_vertex_count_threshold ?
-            (sctx->compute_num_verts_rejected += direct_count, true) : /* Add, then return true. */
-            (sctx->compute_num_verts_ineligible += direct_count, false)) && /* Add, then return false. */
-           (!info->count_from_stream_output || pd_msg("draw_opaque")) &&
-           (primitive_restart ?
-            /* Supported prim types with primitive restart: */
-            (prim == PIPE_PRIM_TRIANGLE_STRIP || pd_msg("bad prim type with primitive restart")) &&
-            /* Disallow instancing with primitive restart: */
-            (instance_count == 1 || pd_msg("instance_count > 1 with primitive restart")) :
-            /* Supported prim types without primitive restart + allow instancing: */
-            (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) |
-                           (1 << PIPE_PRIM_TRIANGLE_STRIP) |
-                           (1 << PIPE_PRIM_TRIANGLE_FAN)) &&
-            /* Instancing is limited to 16-bit indices, because InstanceID is packed into VertexID. */
-            /* TODO: DrawArraysInstanced sometimes doesn't work, so it's disabled. */
-            (instance_count == 1 ||
-             (instance_count <= USHRT_MAX && index_size && index_size <= 2) ||
-             pd_msg("instance_count too large or index_size == 4 or DrawArraysInstanced"))) &&
-           (info->drawid == 0 || !sctx->vs_shader.cso->info.uses_drawid || pd_msg("draw_id > 0")) &&
-           (!sctx->render_cond || pd_msg("render condition")) &&
-           /* Forced enablement ignores pipeline statistics queries. */
-           (sctx->screen->debug_flags & (DBG(PD) | DBG(ALWAYS_PD)) ||
-            (!sctx->num_pipeline_stat_queries && !sctx->streamout.prims_gen_query_enabled) ||
-            pd_msg("pipestat or primgen query")) &&
-           (!sctx->vertex_elements->instance_divisor_is_fetched || pd_msg("loads instance divisors")) &&
-           (!sctx->tes_shader.cso || pd_msg("uses tess")) &&
-           (!sctx->gs_shader.cso || pd_msg("uses GS")) &&
-           (!sctx->ps_shader.cso->info.uses_primid || pd_msg("PS uses PrimID")) &&
-           !rs->polygon_mode_enabled &&
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+   struct pipe_resource *indexbuf = info->index.resource;
+   unsigned dirty_tex_counter, dirty_buf_counter;
+   enum pipe_prim_type rast_prim, prim = info->mode;
+   unsigned index_size = info->index_size;
+   unsigned index_offset = info->indirect ? info->start * index_size : 0;
+   unsigned instance_count = info->instance_count;
+   bool primitive_restart =
+      info->primitive_restart &&
+      (!sctx->screen->options.prim_restart_tri_strips_only ||
+       (prim != PIPE_PRIM_TRIANGLE_STRIP && prim != PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY));
+
+   if (likely(!info->indirect)) {
+      /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is
+       * no workaround for indirect draws, but we can at least skip
+       * direct draws.
+       */
+      if (unlikely(!instance_count))
+         return;
+
+      /* Handle count == 0. */
+      if (unlikely(!info->count && (index_size || !info->count_from_stream_output)))
+         return;
+   }
+
+   struct si_shader_selector *vs = sctx->vs_shader.cso;
+   if (unlikely(!vs || sctx->num_vertex_elements < vs->num_vs_inputs ||
+                (!sctx->ps_shader.cso && !rs->rasterizer_discard) ||
+                (!!sctx->tes_shader.cso != (prim == PIPE_PRIM_PATCHES)))) {
+      assert(0);
+      return;
+   }
+
+   /* Recompute and re-emit the texture resource states if needed. */
+   dirty_tex_counter = p_atomic_read(&sctx->screen->dirty_tex_counter);
+   if (unlikely(dirty_tex_counter != sctx->last_dirty_tex_counter)) {
+      sctx->last_dirty_tex_counter = dirty_tex_counter;
+      sctx->framebuffer.dirty_cbufs |= ((1 << sctx->framebuffer.state.nr_cbufs) - 1);
+      sctx->framebuffer.dirty_zsbuf = true;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
+      si_update_all_texture_descriptors(sctx);
+   }
+
+   dirty_buf_counter = p_atomic_read(&sctx->screen->dirty_buf_counter);
+   if (unlikely(dirty_buf_counter != sctx->last_dirty_buf_counter)) {
+      sctx->last_dirty_buf_counter = dirty_buf_counter;
+      /* Rebind all buffers unconditionally. */
+      si_rebind_buffer(sctx, NULL);
+   }
+
+   si_decompress_textures(sctx, u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS));
+
+   /* Set the rasterization primitive type.
+    *
+    * This must be done after si_decompress_textures, which can call
+    * draw_vbo recursively, and before si_update_shaders, which uses
+    * current_rast_prim for this draw_vbo call. */
+   if (sctx->gs_shader.cso) {
+      /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */
+      rast_prim = sctx->gs_shader.cso->rast_prim;
+   } else if (sctx->tes_shader.cso) {
+      /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */
+      rast_prim = sctx->tes_shader.cso->rast_prim;
+   } else if (util_rast_prim_is_triangles(prim)) {
+      rast_prim = PIPE_PRIM_TRIANGLES;
+   } else {
+      /* Only possibilities: POINTS, LINE*, RECTANGLES */
+      rast_prim = prim;
+   }
+
+   if (rast_prim != sctx->current_rast_prim) {
+      if (util_prim_is_points_or_lines(sctx->current_rast_prim) !=
+          util_prim_is_points_or_lines(rast_prim))
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband);
+
+      sctx->current_rast_prim = rast_prim;
+      sctx->do_update_shaders = true;
+   }
+
+   if (sctx->tes_shader.cso && sctx->screen->info.has_ls_vgpr_init_bug) {
+      /* Determine whether the LS VGPR fix should be applied.
+       *
+       * It is only required when num input CPs > num output CPs,
+       * which cannot happen with the fixed function TCS. We should
+       * also update this bit when switching from TCS to fixed
+       * function TCS.
+       */
+      struct si_shader_selector *tcs = sctx->tcs_shader.cso;
+      bool ls_vgpr_fix =
+         tcs && info->vertices_per_patch > tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
+
+      if (ls_vgpr_fix != sctx->ls_vgpr_fix) {
+         sctx->ls_vgpr_fix = ls_vgpr_fix;
+         sctx->do_update_shaders = true;
+      }
+   }
+
+   if (sctx->chip_class <= GFX9 && sctx->gs_shader.cso) {
+      /* Determine whether the GS triangle strip adjacency fix should
+       * be applied. Rotate every other triangle if
+       * - triangle strips with adjacency are fed to the GS and
+       * - primitive restart is disabled (the rotation doesn't help
+       *   when the restart occurs after an odd number of triangles).
+       */
+      bool gs_tri_strip_adj_fix =
+         !sctx->tes_shader.cso && prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY && !primitive_restart;
+
+      if (gs_tri_strip_adj_fix != sctx->gs_tri_strip_adj_fix) {
+         sctx->gs_tri_strip_adj_fix = gs_tri_strip_adj_fix;
+         sctx->do_update_shaders = true;
+      }
+   }
+
+   if (index_size) {
+      /* Translate or upload, if needed. */
+      /* 8-bit indices are supported on GFX8. */
+      if (sctx->chip_class <= GFX7 && index_size == 1) {
+         unsigned start, count, start_offset, size, offset;
+         void *ptr;
+
+         si_get_draw_start_count(sctx, info, &start, &count);
+         start_offset = start * 2;
+         size = count * 2;
+
+         indexbuf = NULL;
+         u_upload_alloc(ctx->stream_uploader, start_offset, size,
+                        si_optimal_tcc_alignment(sctx, size), &offset, &indexbuf, &ptr);
+         if (!indexbuf)
+            return;
+
+         util_shorten_ubyte_elts_to_userptr(&sctx->b, info, 0, 0, index_offset + start, count, ptr);
+
+         /* info->start will be added by the drawing code */
+         index_offset = offset - start_offset;
+         index_size = 2;
+      } else if (info->has_user_indices) {
+         unsigned start_offset;
+
+         assert(!info->indirect);
+         start_offset = info->start * index_size;
+
+         indexbuf = NULL;
+         u_upload_data(ctx->stream_uploader, start_offset, info->count * index_size,
+                       sctx->screen->info.tcc_cache_line_size,
+                       (char *)info->index.user + start_offset, &index_offset, &indexbuf);
+         if (!indexbuf)
+            return;
+
+         /* info->start will be added by the drawing code */
+         index_offset -= start_offset;
+      } else if (sctx->chip_class <= GFX7 && si_resource(indexbuf)->TC_L2_dirty) {
+         /* GFX8 reads index buffers through TC L2, so it doesn't
+          * need this. */
+         sctx->flags |= SI_CONTEXT_WB_L2;
+         si_resource(indexbuf)->TC_L2_dirty = false;
+      }
+   }
+
+   bool dispatch_prim_discard_cs = false;
+   bool prim_discard_cs_instancing = false;
+   unsigned original_index_size = index_size;
+   unsigned direct_count = 0;
+
+   if (info->indirect) {
+      struct pipe_draw_indirect_info *indirect = info->indirect;
+
+      /* Add the buffer size for memory checking in need_cs_space. */
+      si_context_add_resource_size(sctx, indirect->buffer);
+
+      /* Indirect buffers use TC L2 on GFX9, but not older hw. */
+      if (sctx->chip_class <= GFX8) {
+         if (si_resource(indirect->buffer)->TC_L2_dirty) {
+            sctx->flags |= SI_CONTEXT_WB_L2;
+            si_resource(indirect->buffer)->TC_L2_dirty = false;
+         }
+
+         if (indirect->indirect_draw_count &&
+             si_resource(indirect->indirect_draw_count)->TC_L2_dirty) {
+            sctx->flags |= SI_CONTEXT_WB_L2;
+            si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false;
+         }
+      }
+   } else {
+      /* Multiply by 3 for strips and fans to get an approximate vertex
+       * count as triangles. */
+      direct_count = info->count * instance_count * (prim == PIPE_PRIM_TRIANGLES ? 1 : 3);
+   }
+
+   /* Determine if we can use the primitive discard compute shader. */
+   if (si_compute_prim_discard_enabled(sctx) &&
+       (direct_count > sctx->prim_discard_vertex_count_threshold
+           ? (sctx->compute_num_verts_rejected += direct_count, true)
+           : /* Add, then return true. */
+           (sctx->compute_num_verts_ineligible += direct_count,
+            false)) && /* Add, then return false. */
+       (!info->count_from_stream_output || pd_msg("draw_opaque")) &&
+       (primitive_restart ?
+                          /* Supported prim types with primitive restart: */
+           (prim == PIPE_PRIM_TRIANGLE_STRIP || pd_msg("bad prim type with primitive restart")) &&
+              /* Disallow instancing with primitive restart: */
+              (instance_count == 1 || pd_msg("instance_count > 1 with primitive restart"))
+                          :
+                          /* Supported prim types without primitive restart + allow instancing: */
+           (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP) |
+                          (1 << PIPE_PRIM_TRIANGLE_FAN)) &&
+              /* Instancing is limited to 16-bit indices, because InstanceID is packed into
+                 VertexID. */
+              /* TODO: DrawArraysInstanced sometimes doesn't work, so it's disabled. */
+              (instance_count == 1 ||
+               (instance_count <= USHRT_MAX && index_size && index_size <= 2) ||
+               pd_msg("instance_count too large or index_size == 4 or DrawArraysInstanced"))) &&
+       (info->drawid == 0 || !sctx->vs_shader.cso->info.uses_drawid || pd_msg("draw_id > 0")) &&
+       (!sctx->render_cond || pd_msg("render condition")) &&
+       /* Forced enablement ignores pipeline statistics queries. */
+       (sctx->screen->debug_flags & (DBG(PD) | DBG(ALWAYS_PD)) ||
+        (!sctx->num_pipeline_stat_queries && !sctx->streamout.prims_gen_query_enabled) ||
+        pd_msg("pipestat or primgen query")) &&
+       (!sctx->vertex_elements->instance_divisor_is_fetched || pd_msg("loads instance divisors")) &&
+       (!sctx->tes_shader.cso || pd_msg("uses tess")) &&
+       (!sctx->gs_shader.cso || pd_msg("uses GS")) &&
+       (!sctx->ps_shader.cso->info.uses_primid || pd_msg("PS uses PrimID")) &&
+       !rs->polygon_mode_enabled &&
 #if SI_PRIM_DISCARD_DEBUG /* same as cso->prim_discard_cs_allowed */
-           (!sctx->vs_shader.cso->info.uses_bindless_images || pd_msg("uses bindless images")) &&
-           (!sctx->vs_shader.cso->info.uses_bindless_samplers || pd_msg("uses bindless samplers")) &&
-           (!sctx->vs_shader.cso->info.writes_memory || pd_msg("writes memory")) &&
-           (!sctx->vs_shader.cso->info.writes_viewport_index || pd_msg("writes viewport index")) &&
-           !sctx->vs_shader.cso->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] &&
-           !sctx->vs_shader.cso->so.num_outputs &&
+       (!sctx->vs_shader.cso->info.uses_bindless_images || pd_msg("uses bindless images")) &&
+       (!sctx->vs_shader.cso->info.uses_bindless_samplers || pd_msg("uses bindless samplers")) &&
+       (!sctx->vs_shader.cso->info.writes_memory || pd_msg("writes memory")) &&
+       (!sctx->vs_shader.cso->info.writes_viewport_index || pd_msg("writes viewport index")) &&
+       !sctx->vs_shader.cso->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] &&
+       !sctx->vs_shader.cso->so.num_outputs &&
 #else
-           (sctx->vs_shader.cso->prim_discard_cs_allowed || pd_msg("VS shader uses unsupported features")) &&
+       (sctx->vs_shader.cso->prim_discard_cs_allowed ||
+        pd_msg("VS shader uses unsupported features")) &&
 #endif
-           /* Check that all buffers are used for read only, because compute
-            * dispatches can run ahead. */
-           (si_all_vs_resources_read_only(sctx, index_size ? indexbuf : NULL) || pd_msg("write reference"))) {
-               switch (si_prepare_prim_discard_or_split_draw(sctx, info, primitive_restart)) {
-               case SI_PRIM_DISCARD_ENABLED:
-                       original_index_size = index_size;
-                       prim_discard_cs_instancing = instance_count > 1;
-                       dispatch_prim_discard_cs = true;
-
-                       /* The compute shader changes/lowers the following: */
-                       prim = PIPE_PRIM_TRIANGLES;
-                       index_size = 4;
-                       instance_count = 1;
-                       primitive_restart = false;
-                       sctx->compute_num_verts_rejected -= direct_count;
-                       sctx->compute_num_verts_accepted += direct_count;
-                       break;
-               case SI_PRIM_DISCARD_DISABLED:
-                       break;
-               case SI_PRIM_DISCARD_DRAW_SPLIT:
-                       sctx->compute_num_verts_rejected -= direct_count;
-                       goto return_cleanup;
-               }
-       }
-
-       if (prim_discard_cs_instancing != sctx->prim_discard_cs_instancing) {
-               sctx->prim_discard_cs_instancing = prim_discard_cs_instancing;
-               sctx->do_update_shaders = true;
-       }
-
-       /* Update NGG culling settings. */
-       if (sctx->ngg &&
-           !dispatch_prim_discard_cs &&
-           rast_prim == PIPE_PRIM_TRIANGLES &&
-           (sctx->screen->always_use_ngg_culling ||
-            /* At least 1024 non-indexed vertices (8 subgroups) are needed
-             * per draw call (no TES/GS) to enable NGG culling.
-             */
-            (!index_size && direct_count >= 1024 &&
-             (prim == PIPE_PRIM_TRIANGLES || prim == PIPE_PRIM_TRIANGLE_STRIP) &&
-             !sctx->tes_shader.cso && !sctx->gs_shader.cso)) &&
-           si_get_vs(sctx)->cso->ngg_culling_allowed) {
-               unsigned ngg_culling = 0;
-
-               if (rs->rasterizer_discard) {
-                       ngg_culling |= SI_NGG_CULL_FRONT_FACE |
-                                      SI_NGG_CULL_BACK_FACE;
-               } else {
-                       /* Polygon mode can't use view and small primitive culling,
-                        * because it draws points or lines where the culling depends
-                        * on the point or line width.
-                        */
-                       if (!rs->polygon_mode_enabled)
-                               ngg_culling |= SI_NGG_CULL_VIEW_SMALLPRIMS;
-
-                       if (sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front)
-                               ngg_culling |= SI_NGG_CULL_FRONT_FACE;
-                       if (sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back)
-                               ngg_culling |= SI_NGG_CULL_BACK_FACE;
-               }
-
-               /* Use NGG fast launch for certain non-indexed primitive types.
-                * A draw must have at least 1 full primitive.
-                */
-               if (ngg_culling && !index_size && direct_count >= 3 &&
-                   !sctx->tes_shader.cso && !sctx->gs_shader.cso) {
-                       if (prim == PIPE_PRIM_TRIANGLES)
-                               ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST;
-                       else if (prim == PIPE_PRIM_TRIANGLE_STRIP)
-                               ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP;
-               }
-
-               if (ngg_culling != sctx->ngg_culling) {
-                       /* Insert a VGT_FLUSH when enabling fast launch changes to prevent hangs.
-                        * See issues #2418, #2426, #2434
-                        */
-                       if (ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
-                               sctx->flags |= SI_CONTEXT_VGT_FLUSH;
-                       sctx->ngg_culling = ngg_culling;
-                       sctx->do_update_shaders = true;
-               }
-       } else if (sctx->ngg_culling) {
-               sctx->ngg_culling = false;
-               sctx->do_update_shaders = true;
-       }
-
-       if (sctx->do_update_shaders && !si_update_shaders(sctx))
-               goto return_cleanup;
-
-       si_need_gfx_cs_space(sctx);
-
-       if (sctx->bo_list_add_all_gfx_resources)
-               si_gfx_resources_add_all_to_bo_list(sctx);
-
-       /* Since we've called si_context_add_resource_size for vertex buffers,
-        * this must be called after si_need_cs_space, because we must let
-        * need_cs_space flush before we add buffers to the buffer list.
-        */
-       if (!si_upload_vertex_buffer_descriptors(sctx))
-               goto return_cleanup;
-
-       /* Vega10/Raven scissor bug workaround. When any context register is
-        * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
-        * registers must be written too.
-        */
-       unsigned masked_atoms = 0;
-
-       if (sctx->screen->info.has_gfx9_scissor_bug) {
-               masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
-
-               if (info->count_from_stream_output ||
-                   sctx->dirty_atoms & si_atoms_that_always_roll_context() ||
-                   sctx->dirty_states & si_states_that_always_roll_context())
-                       sctx->context_roll = true;
-       }
-
-       /* Use optimal packet order based on whether we need to sync the pipeline. */
-       if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
-                                     SI_CONTEXT_FLUSH_AND_INV_DB |
-                                     SI_CONTEXT_PS_PARTIAL_FLUSH |
-                                     SI_CONTEXT_CS_PARTIAL_FLUSH))) {
-               /* If we have to wait for idle, set all states first, so that all
-                * SET packets are processed in parallel with previous draw calls.
-                * Then draw and prefetch at the end. This ensures that the time
-                * the CUs are idle is very short.
-                */
-               if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND))
-                       masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
-
-               if (!si_upload_graphics_shader_descriptors(sctx))
-                       goto return_cleanup;
-
-               /* Emit all states except possibly render condition. */
-               si_emit_all_states(sctx, info, prim, instance_count,
-                                  primitive_restart, masked_atoms);
-               sctx->emit_cache_flush(sctx);
-               /* <-- CUs are idle here. */
-
-               if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond))
-                       sctx->atoms.s.render_cond.emit(sctx);
-
-               if (sctx->screen->info.has_gfx9_scissor_bug &&
-                   (sctx->context_roll ||
-                    si_is_atom_dirty(sctx, &sctx->atoms.s.scissors)))
-                       sctx->atoms.s.scissors.emit(sctx);
-
-               sctx->dirty_atoms = 0;
-
-               si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset,
-                                    instance_count, dispatch_prim_discard_cs,
-                                    original_index_size);
-               /* <-- CUs are busy here. */
-
-               /* Start prefetches after the draw has been started. Both will run
-                * in parallel, but starting the draw first is more important.
-                */
-               if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask)
-                       cik_emit_prefetch_L2(sctx, false);
-       } else {
-               /* If we don't wait for idle, start prefetches first, then set
-                * states, and draw at the end.
-                */
-               if (sctx->flags)
-                       sctx->emit_cache_flush(sctx);
-
-               /* Only prefetch the API VS and VBO descriptors. */
-               if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask)
-                       cik_emit_prefetch_L2(sctx, true);
-
-               if (!si_upload_graphics_shader_descriptors(sctx))
-                       goto return_cleanup;
-
-               si_emit_all_states(sctx, info, prim, instance_count,
-                                  primitive_restart, masked_atoms);
-
-               if (sctx->screen->info.has_gfx9_scissor_bug &&
-                   (sctx->context_roll ||
-                    si_is_atom_dirty(sctx, &sctx->atoms.s.scissors)))
-                       sctx->atoms.s.scissors.emit(sctx);
-
-               sctx->dirty_atoms = 0;
-
-               si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset,
-                                    instance_count, dispatch_prim_discard_cs,
-                                    original_index_size);
-
-               /* Prefetch the remaining shaders after the draw has been
-                * started. */
-               if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask)
-                       cik_emit_prefetch_L2(sctx, false);
-       }
-
-       /* Mark the displayable dcc buffer as dirty in order to update
-        * it on the next call to si_flush_resource. */
-       if (sctx->screen->info.use_display_dcc_with_retile_blit) {
-               /* Don't use si_update_fb_dirtiness_after_rendering because it'll
-                * cause unnecessary texture decompressions on each draw. */
-               unsigned displayable_dcc_cb_mask = sctx->framebuffer.displayable_dcc_cb_mask;
-               while (displayable_dcc_cb_mask) {
-                       unsigned i = u_bit_scan(&displayable_dcc_cb_mask);
-                       struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i];
-                       struct si_texture *tex = (struct si_texture*) surf->texture;
-                       tex->displayable_dcc_dirty = true;
-               }
-       }
-
-       /* Clear the context roll flag after the draw call. */
-       sctx->context_roll = false;
-
-       if (unlikely(sctx->current_saved_cs)) {
-               si_trace_emit(sctx);
-               si_log_draw_state(sctx, sctx->log);
-       }
-
-       /* Workaround for a VGT hang when streamout is enabled.
-        * It must be done after drawing. */
-       if ((sctx->family == CHIP_HAWAII ||
-            sctx->family == CHIP_TONGA ||
-            sctx->family == CHIP_FIJI) &&
-           si_get_strmout_en(sctx)) {
-               sctx->flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
-       }
-
-       if (unlikely(sctx->decompression_enabled)) {
-               sctx->num_decompress_calls++;
-       } else {
-               sctx->num_draw_calls++;
-               if (sctx->framebuffer.state.nr_cbufs > 1)
-                       sctx->num_mrt_draw_calls++;
-               if (primitive_restart)
-                       sctx->num_prim_restart_calls++;
-               if (G_0286E8_WAVESIZE(sctx->spi_tmpring_size))
-                       sctx->num_spill_draw_calls++;
-       }
+       /* Check that all buffers are used for read only, because compute
+        * dispatches can run ahead. */
+       (si_all_vs_resources_read_only(sctx, index_size ? indexbuf : NULL) ||
+        pd_msg("write reference"))) {
+      switch (si_prepare_prim_discard_or_split_draw(sctx, info, primitive_restart)) {
+      case SI_PRIM_DISCARD_ENABLED:
+         original_index_size = index_size;
+         prim_discard_cs_instancing = instance_count > 1;
+         dispatch_prim_discard_cs = true;
+
+         /* The compute shader changes/lowers the following: */
+         prim = PIPE_PRIM_TRIANGLES;
+         index_size = 4;
+         instance_count = 1;
+         primitive_restart = false;
+         sctx->compute_num_verts_rejected -= direct_count;
+         sctx->compute_num_verts_accepted += direct_count;
+         break;
+      case SI_PRIM_DISCARD_DISABLED:
+         break;
+      case SI_PRIM_DISCARD_DRAW_SPLIT:
+         sctx->compute_num_verts_rejected -= direct_count;
+         goto return_cleanup;
+      }
+   }
+
+   if (prim_discard_cs_instancing != sctx->prim_discard_cs_instancing) {
+      sctx->prim_discard_cs_instancing = prim_discard_cs_instancing;
+      sctx->do_update_shaders = true;
+   }
+
+   /* Update NGG culling settings. */
+   if (sctx->ngg && !dispatch_prim_discard_cs && rast_prim == PIPE_PRIM_TRIANGLES &&
+       (sctx->screen->always_use_ngg_culling ||
+        /* At least 1024 non-indexed vertices (8 subgroups) are needed
+         * per draw call (no TES/GS) to enable NGG culling.
+         */
+        (!index_size && direct_count >= 1024 &&
+         (prim == PIPE_PRIM_TRIANGLES || prim == PIPE_PRIM_TRIANGLE_STRIP) &&
+         !sctx->tes_shader.cso && !sctx->gs_shader.cso)) &&
+       si_get_vs(sctx)->cso->ngg_culling_allowed) {
+      unsigned ngg_culling = 0;
+
+      if (rs->rasterizer_discard) {
+         ngg_culling |= SI_NGG_CULL_FRONT_FACE | SI_NGG_CULL_BACK_FACE;
+      } else {
+         /* Polygon mode can't use view and small primitive culling,
+          * because it draws points or lines where the culling depends
+          * on the point or line width.
+          */
+         if (!rs->polygon_mode_enabled)
+            ngg_culling |= SI_NGG_CULL_VIEW_SMALLPRIMS;
+
+         if (sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front)
+            ngg_culling |= SI_NGG_CULL_FRONT_FACE;
+         if (sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back)
+            ngg_culling |= SI_NGG_CULL_BACK_FACE;
+      }
+
+      /* Use NGG fast launch for certain non-indexed primitive types.
+       * A draw must have at least 1 full primitive.
+       */
+      if (ngg_culling && !index_size && direct_count >= 3 && !sctx->tes_shader.cso &&
+          !sctx->gs_shader.cso) {
+         if (prim == PIPE_PRIM_TRIANGLES)
+            ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST;
+         else if (prim == PIPE_PRIM_TRIANGLE_STRIP)
+            ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP;
+      }
+
+      if (ngg_culling != sctx->ngg_culling) {
+         /* Insert a VGT_FLUSH when enabling fast launch changes to prevent hangs.
+          * See issues #2418, #2426, #2434
+          */
+         if (ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
+            sctx->flags |= SI_CONTEXT_VGT_FLUSH;
+         sctx->ngg_culling = ngg_culling;
+         sctx->do_update_shaders = true;
+      }
+   } else if (sctx->ngg_culling) {
+      sctx->ngg_culling = false;
+      sctx->do_update_shaders = true;
+   }
+
+   if (sctx->do_update_shaders && !si_update_shaders(sctx))
+      goto return_cleanup;
+
+   si_need_gfx_cs_space(sctx);
+
+   if (sctx->bo_list_add_all_gfx_resources)
+      si_gfx_resources_add_all_to_bo_list(sctx);
+
+   /* Since we've called si_context_add_resource_size for vertex buffers,
+    * this must be called after si_need_cs_space, because we must let
+    * need_cs_space flush before we add buffers to the buffer list.
+    */
+   if (!si_upload_vertex_buffer_descriptors(sctx))
+      goto return_cleanup;
+
+   /* Vega10/Raven scissor bug workaround. When any context register is
+    * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
+    * registers must be written too.
+    */
+   unsigned masked_atoms = 0;
+
+   if (sctx->screen->info.has_gfx9_scissor_bug) {
+      masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
+
+      if (info->count_from_stream_output ||
+          sctx->dirty_atoms & si_atoms_that_always_roll_context() ||
+          sctx->dirty_states & si_states_that_always_roll_context())
+         sctx->context_roll = true;
+   }
+
+   /* Use optimal packet order based on whether we need to sync the pipeline. */
+   if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB |
+                               SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH))) {
+      /* If we have to wait for idle, set all states first, so that all
+       * SET packets are processed in parallel with previous draw calls.
+       * Then draw and prefetch at the end. This ensures that the time
+       * the CUs are idle is very short.
+       */
+      if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND))
+         masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
+
+      if (!si_upload_graphics_shader_descriptors(sctx))
+         goto return_cleanup;
+
+      /* Emit all states except possibly render condition. */
+      si_emit_all_states(sctx, info, prim, instance_count, primitive_restart, masked_atoms);
+      sctx->emit_cache_flush(sctx);
+      /* <-- CUs are idle here. */
+
+      if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond))
+         sctx->atoms.s.render_cond.emit(sctx);
+
+      if (sctx->screen->info.has_gfx9_scissor_bug &&
+          (sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors)))
+         sctx->atoms.s.scissors.emit(sctx);
+
+      sctx->dirty_atoms = 0;
+
+      si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset, instance_count,
+                           dispatch_prim_discard_cs, original_index_size);
+      /* <-- CUs are busy here. */
+
+      /* Start prefetches after the draw has been started. Both will run
+       * in parallel, but starting the draw first is more important.
+       */
+      if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask)
+         cik_emit_prefetch_L2(sctx, false);
+   } else {
+      /* If we don't wait for idle, start prefetches first, then set
+       * states, and draw at the end.
+       */
+      if (sctx->flags)
+         sctx->emit_cache_flush(sctx);
+
+      /* Only prefetch the API VS and VBO descriptors. */
+      if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask)
+         cik_emit_prefetch_L2(sctx, true);
+
+      if (!si_upload_graphics_shader_descriptors(sctx))
+         goto return_cleanup;
+
+      si_emit_all_states(sctx, info, prim, instance_count, primitive_restart, masked_atoms);
+
+      if (sctx->screen->info.has_gfx9_scissor_bug &&
+          (sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors)))
+         sctx->atoms.s.scissors.emit(sctx);
+
+      sctx->dirty_atoms = 0;
+
+      si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset, instance_count,
+                           dispatch_prim_discard_cs, original_index_size);
+
+      /* Prefetch the remaining shaders after the draw has been
+       * started. */
+      if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask)
+         cik_emit_prefetch_L2(sctx, false);
+   }
+
+   /* Mark the displayable dcc buffer as dirty in order to update
+    * it on the next call to si_flush_resource. */
+   if (sctx->screen->info.use_display_dcc_with_retile_blit) {
+      /* Don't use si_update_fb_dirtiness_after_rendering because it'll
+       * cause unnecessary texture decompressions on each draw. */
+      unsigned displayable_dcc_cb_mask = sctx->framebuffer.displayable_dcc_cb_mask;
+      while (displayable_dcc_cb_mask) {
+         unsigned i = u_bit_scan(&displayable_dcc_cb_mask);
+         struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i];
+         struct si_texture *tex = (struct si_texture *)surf->texture;
+         tex->displayable_dcc_dirty = true;
+      }
+   }
+
+   /* Clear the context roll flag after the draw call. */
+   sctx->context_roll = false;
+
+   if (unlikely(sctx->current_saved_cs)) {
+      si_trace_emit(sctx);
+      si_log_draw_state(sctx, sctx->log);
+   }
+
+   /* Workaround for a VGT hang when streamout is enabled.
+    * It must be done after drawing. */
+   if ((sctx->family == CHIP_HAWAII || sctx->family == CHIP_TONGA || sctx->family == CHIP_FIJI) &&
+       si_get_strmout_en(sctx)) {
+      sctx->flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
+   }
+
+   if (unlikely(sctx->decompression_enabled)) {
+      sctx->num_decompress_calls++;
+   } else {
+      sctx->num_draw_calls++;
+      if (sctx->framebuffer.state.nr_cbufs > 1)
+         sctx->num_mrt_draw_calls++;
+      if (primitive_restart)
+         sctx->num_prim_restart_calls++;
+      if (G_0286E8_WAVESIZE(sctx->spi_tmpring_size))
+         sctx->num_spill_draw_calls++;
+   }
 
 return_cleanup:
-       if (index_size && indexbuf != info->index.resource)
-               pipe_resource_reference(&indexbuf, NULL);
+   if (index_size && indexbuf != info->index.resource)
+      pipe_resource_reference(&indexbuf, NULL);
 }
 
-static void
-si_draw_rectangle(struct blitter_context *blitter,
-                 void *vertex_elements_cso,
-                 blitter_get_vs_func get_vs,
-                 int x1, int y1, int x2, int y2,
-                 float depth, unsigned num_instances,
-                 enum blitter_attrib_type type,
-                 const union blitter_attrib *attrib)
+static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elements_cso,
+                              blitter_get_vs_func get_vs, int x1, int y1, int x2, int y2,
+                              float depth, unsigned num_instances, enum blitter_attrib_type type,
+                              const union blitter_attrib *attrib)
 {
-       struct pipe_context *pipe = util_blitter_get_pipe(blitter);
-       struct si_context *sctx = (struct si_context*)pipe;
-
-       /* Pack position coordinates as signed int16. */
-       sctx->vs_blit_sh_data[0] = (uint32_t)(x1 & 0xffff) |
-                                  ((uint32_t)(y1 & 0xffff) << 16);
-       sctx->vs_blit_sh_data[1] = (uint32_t)(x2 & 0xffff) |
-                                  ((uint32_t)(y2 & 0xffff) << 16);
-       sctx->vs_blit_sh_data[2] = fui(depth);
-
-       switch (type) {
-       case UTIL_BLITTER_ATTRIB_COLOR:
-               memcpy(&sctx->vs_blit_sh_data[3], attrib->color,
-                      sizeof(float)*4);
-               break;
-       case UTIL_BLITTER_ATTRIB_TEXCOORD_XY:
-       case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW:
-               memcpy(&sctx->vs_blit_sh_data[3], &attrib->texcoord,
-                      sizeof(attrib->texcoord));
-               break;
-       case UTIL_BLITTER_ATTRIB_NONE:;
-       }
-
-       pipe->bind_vs_state(pipe, si_get_blitter_vs(sctx, type, num_instances));
-
-       struct pipe_draw_info info = {};
-       info.mode = SI_PRIM_RECTANGLE_LIST;
-       info.count = 3;
-       info.instance_count = num_instances;
-
-       /* Don't set per-stage shader pointers for VS. */
-       sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(VERTEX);
-       sctx->vertex_buffer_pointer_dirty = false;
-       sctx->vertex_buffer_user_sgprs_dirty = false;
-
-       si_draw_vbo(pipe, &info);
+   struct pipe_context *pipe = util_blitter_get_pipe(blitter);
+   struct si_context *sctx = (struct si_context *)pipe;
+
+   /* Pack position coordinates as signed int16. */
+   sctx->vs_blit_sh_data[0] = (uint32_t)(x1 & 0xffff) | ((uint32_t)(y1 & 0xffff) << 16);
+   sctx->vs_blit_sh_data[1] = (uint32_t)(x2 & 0xffff) | ((uint32_t)(y2 & 0xffff) << 16);
+   sctx->vs_blit_sh_data[2] = fui(depth);
+
+   switch (type) {
+   case UTIL_BLITTER_ATTRIB_COLOR:
+      memcpy(&sctx->vs_blit_sh_data[3], attrib->color, sizeof(float) * 4);
+      break;
+   case UTIL_BLITTER_ATTRIB_TEXCOORD_XY:
+   case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW:
+      memcpy(&sctx->vs_blit_sh_data[3], &attrib->texcoord, sizeof(attrib->texcoord));
+      break;
+   case UTIL_BLITTER_ATTRIB_NONE:;
+   }
+
+   pipe->bind_vs_state(pipe, si_get_blitter_vs(sctx, type, num_instances));
+
+   struct pipe_draw_info info = {};
+   info.mode = SI_PRIM_RECTANGLE_LIST;
+   info.count = 3;
+   info.instance_count = num_instances;
+
+   /* Don't set per-stage shader pointers for VS. */
+   sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(VERTEX);
+   sctx->vertex_buffer_pointer_dirty = false;
+   sctx->vertex_buffer_user_sgprs_dirty = false;
+
+   si_draw_vbo(pipe, &info);
 }
 
 void si_trace_emit(struct si_context *sctx)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       uint32_t trace_id = ++sctx->current_saved_cs->trace_id;
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   uint32_t trace_id = ++sctx->current_saved_cs->trace_id;
 
-       si_cp_write_data(sctx, sctx->current_saved_cs->trace_buf,
-                        0, 4, V_370_MEM, V_370_ME, &trace_id);
+   si_cp_write_data(sctx, sctx->current_saved_cs->trace_buf, 0, 4, V_370_MEM, V_370_ME, &trace_id);
 
-       radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-       radeon_emit(cs, AC_ENCODE_TRACE_POINT(trace_id));
+   radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+   radeon_emit(cs, AC_ENCODE_TRACE_POINT(trace_id));
 
-       if (sctx->log)
-               u_log_flush(sctx->log);
+   if (sctx->log)
+      u_log_flush(sctx->log);
 }
 
 void si_init_draw_functions(struct si_context *sctx)
 {
-       sctx->b.draw_vbo = si_draw_vbo;
+   sctx->b.draw_vbo = si_draw_vbo;
 
-       sctx->blitter->draw_rectangle = si_draw_rectangle;
+   sctx->blitter->draw_rectangle = si_draw_rectangle;
 
-       si_init_ia_multi_vgt_param_table(sctx);
+   si_init_ia_multi_vgt_param_table(sctx);
 }
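
A minimal standalone sketch (not part of the diff, with illustrative helper names) of the packing scheme si_draw_rectangle uses above: each vs_blit_sh_data word holds two signed 16-bit coordinates, x in the low half and y in the high half.

#include <stdint.h>
#include <stdio.h>

/* Pack two signed coordinates into one dword: x in bits 0-15, y in bits 16-31. */
static uint32_t pack_coords16(int x, int y)
{
   return (uint32_t)(x & 0xffff) | ((uint32_t)(y & 0xffff) << 16);
}

/* Recover the signed values by sign-extending each 16-bit half. */
static void unpack_coords16(uint32_t packed, int *x, int *y)
{
   *x = (int16_t)(packed & 0xffff);
   *y = (int16_t)(packed >> 16);
}

int main(void)
{
   int x, y;
   unpack_coords16(pack_coords16(-8, 300), &x, &y);
   printf("%d %d\n", x, y); /* prints: -8 300 */
   return 0;
}
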
index 0fa38918b20dd48e93553f0c150b072250b84fa7..9ebb1e5dcb4ddd44609116cd864ec801029cd434 100644 (file)
 #include "si_build_pm4.h"
 
 /* For MSAA sample positions. */
-#define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y)  \
-       ((((unsigned)(s0x) & 0xf) << 0)  | (((unsigned)(s0y) & 0xf) << 4)  | \
-        (((unsigned)(s1x) & 0xf) << 8)  | (((unsigned)(s1y) & 0xf) << 12) | \
-        (((unsigned)(s2x) & 0xf) << 16) | (((unsigned)(s2y) & 0xf) << 20) | \
-        (((unsigned)(s3x) & 0xf) << 24) | (((unsigned)(s3y) & 0xf) << 28))
+#define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y)                                          \
+   ((((unsigned)(s0x)&0xf) << 0) | (((unsigned)(s0y)&0xf) << 4) | (((unsigned)(s1x)&0xf) << 8) |   \
+    (((unsigned)(s1y)&0xf) << 12) | (((unsigned)(s2x)&0xf) << 16) |                                \
+    (((unsigned)(s2y)&0xf) << 20) | (((unsigned)(s3x)&0xf) << 24) | (((unsigned)(s3y)&0xf) << 28))
 
 /* For obtaining location coordinates from registers */
-#define SEXT4(x)               ((int)((x) | ((x) & 0x8 ? 0xfffffff0 : 0)))
-#define GET_SFIELD(reg, index) SEXT4(((reg) >> ((index) * 4)) & 0xf)
-#define GET_SX(reg, index)     GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2)
-#define GET_SY(reg, index)     GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2 + 1)
+#define SEXT4(x)               ((int)((x) | ((x)&0x8 ? 0xfffffff0 : 0)))
+#define GET_SFIELD(reg, index) SEXT4(((reg) >> ((index)*4)) & 0xf)
+#define GET_SX(reg, index)     GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2)
+#define GET_SY(reg, index)     GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2 + 1)
 
 /* The following sample ordering is required by EQAA.
  *
 
 /* 1x MSAA */
 static const uint32_t sample_locs_1x =
-       FILL_SREG( 0, 0,   0, 0,   0, 0,   0, 0); /* S1, S2, S3 fields are not used by 1x */
+   FILL_SREG(0, 0, 0, 0, 0, 0, 0, 0); /* S1, S2, S3 fields are not used by 1x */
 static const uint64_t centroid_priority_1x = 0x0000000000000000ull;
 
 /* 2x MSAA (the positions are sorted for EQAA) */
 static const uint32_t sample_locs_2x =
-       FILL_SREG(-4,-4,   4, 4,   0, 0,   0, 0); /* S2 & S3 fields are not used by 2x MSAA */
+   FILL_SREG(-4, -4, 4, 4, 0, 0, 0, 0); /* S2 & S3 fields are not used by 2x MSAA */
 static const uint64_t centroid_priority_2x = 0x1010101010101010ull;
 
 /* 4x MSAA (the positions are sorted for EQAA) */
-static const uint32_t sample_locs_4x =
-       FILL_SREG(-2,-6,   2, 6,   -6, 2,  6,-2);
+static const uint32_t sample_locs_4x = FILL_SREG(-2, -6, 2, 6, -6, 2, 6, -2);
 static const uint64_t centroid_priority_4x = 0x3210321032103210ull;
 
 /* 8x MSAA (the positions are sorted for EQAA) */
 static const uint32_t sample_locs_8x[] = {
-       FILL_SREG(-3,-5,   5, 1,  -1, 3,   7,-7),
-       FILL_SREG(-7,-1,   3, 7,  -5, 5,   1,-3),
-       /* The following are unused by hardware, but we emit them to IBs
-        * instead of multiple SET_CONTEXT_REG packets. */
-       0,
-       0,
+   FILL_SREG(-3, -5, 5, 1, -1, 3, 7, -7),
+   FILL_SREG(-7, -1, 3, 7, -5, 5, 1, -3),
+   /* The following are unused by hardware, but we emit them to IBs
+    * instead of multiple SET_CONTEXT_REG packets. */
+   0,
+   0,
 };
 static const uint64_t centroid_priority_8x = 0x3546012735460127ull;
 
 /* 16x MSAA (the positions are sorted for EQAA) */
 static const uint32_t sample_locs_16x[] = {
-       FILL_SREG(-5,-2,   5, 3,  -2, 6,   3,-5),
-       FILL_SREG(-4,-6,   1, 1,  -6, 4,   7,-4),
-       FILL_SREG(-1,-3,   6, 7,  -3, 2,   0,-7),
-       FILL_SREG(-7,-8,   2, 5,  -8, 0,   4,-1),
+   FILL_SREG(-5, -2, 5, 3, -2, 6, 3, -5),
+   FILL_SREG(-4, -6, 1, 1, -6, 4, 7, -4),
+   FILL_SREG(-1, -3, 6, 7, -3, 2, 0, -7),
+   FILL_SREG(-7, -8, 2, 5, -8, 0, 4, -1),
 };
 static const uint64_t centroid_priority_16x = 0xc97e64b231d0fa85ull;
 
 static void si_get_sample_position(struct pipe_context *ctx, unsigned sample_count,
-                                  unsigned sample_index, float *out_value)
+                                   unsigned sample_index, float *out_value)
 {
-       const uint32_t *sample_locs;
-
-       switch (sample_count) {
-       case 1:
-       default:
-               sample_locs = &sample_locs_1x;
-               break;
-       case 2:
-               sample_locs = &sample_locs_2x;
-               break;
-       case 4:
-               sample_locs = &sample_locs_4x;
-               break;
-       case 8:
-               sample_locs = sample_locs_8x;
-               break;
-       case 16:
-               sample_locs = sample_locs_16x;
-               break;
-       }
-
-       out_value[0] = (GET_SX(sample_locs, sample_index) + 8) / 16.0f;
-       out_value[1] = (GET_SY(sample_locs, sample_index) + 8) / 16.0f;
+   const uint32_t *sample_locs;
+
+   switch (sample_count) {
+   case 1:
+   default:
+      sample_locs = &sample_locs_1x;
+      break;
+   case 2:
+      sample_locs = &sample_locs_2x;
+      break;
+   case 4:
+      sample_locs = &sample_locs_4x;
+      break;
+   case 8:
+      sample_locs = sample_locs_8x;
+      break;
+   case 16:
+      sample_locs = sample_locs_16x;
+      break;
+   }
+
+   out_value[0] = (GET_SX(sample_locs, sample_index) + 8) / 16.0f;
+   out_value[1] = (GET_SY(sample_locs, sample_index) + 8) / 16.0f;
 }
 
-static void si_emit_max_4_sample_locs(struct radeon_cmdbuf *cs,
-                                     uint64_t centroid_priority,
-                                     uint32_t sample_locs)
+static void si_emit_max_4_sample_locs(struct radeon_cmdbuf *cs, uint64_t centroid_priority,
+                                      uint32_t sample_locs)
 {
-       radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
-       radeon_emit(cs, centroid_priority);
-       radeon_emit(cs, centroid_priority >> 32);
-       radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs);
-       radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs);
-       radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs);
-       radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs);
+   radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
+   radeon_emit(cs, centroid_priority);
+   radeon_emit(cs, centroid_priority >> 32);
+   radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs);
+   radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs);
+   radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs);
+   radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs);
 }
 
-static void si_emit_max_16_sample_locs(struct radeon_cmdbuf *cs,
-                                      uint64_t centroid_priority,
-                                      const uint32_t *sample_locs,
-                                      unsigned num_samples)
+static void si_emit_max_16_sample_locs(struct radeon_cmdbuf *cs, uint64_t centroid_priority,
+                                       const uint32_t *sample_locs, unsigned num_samples)
 {
-       radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
-       radeon_emit(cs, centroid_priority);
-       radeon_emit(cs, centroid_priority >> 32);
-       radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
-                                  num_samples == 8 ? 14 : 16);
-       radeon_emit_array(cs, sample_locs, 4);
-       radeon_emit_array(cs, sample_locs, 4);
-       radeon_emit_array(cs, sample_locs, 4);
-       radeon_emit_array(cs, sample_locs, num_samples == 8 ? 2 : 4);
+   radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
+   radeon_emit(cs, centroid_priority);
+   radeon_emit(cs, centroid_priority >> 32);
+   radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
+                              num_samples == 8 ? 14 : 16);
+   radeon_emit_array(cs, sample_locs, 4);
+   radeon_emit_array(cs, sample_locs, 4);
+   radeon_emit_array(cs, sample_locs, 4);
+   radeon_emit_array(cs, sample_locs, num_samples == 8 ? 2 : 4);
 }
 
 void si_emit_sample_locations(struct radeon_cmdbuf *cs, int nr_samples)
 {
-       switch (nr_samples) {
-       default:
-       case 1:
-               si_emit_max_4_sample_locs(cs, centroid_priority_1x, sample_locs_1x);
-               break;
-       case 2:
-               si_emit_max_4_sample_locs(cs, centroid_priority_2x, sample_locs_2x);
-               break;
-       case 4:
-               si_emit_max_4_sample_locs(cs, centroid_priority_4x, sample_locs_4x);
-               break;
-       case 8:
-               si_emit_max_16_sample_locs(cs, centroid_priority_8x, sample_locs_8x, 8);
-               break;
-       case 16:
-               si_emit_max_16_sample_locs(cs, centroid_priority_16x, sample_locs_16x, 16);
-               break;
-       }
+   switch (nr_samples) {
+   default:
+   case 1:
+      si_emit_max_4_sample_locs(cs, centroid_priority_1x, sample_locs_1x);
+      break;
+   case 2:
+      si_emit_max_4_sample_locs(cs, centroid_priority_2x, sample_locs_2x);
+      break;
+   case 4:
+      si_emit_max_4_sample_locs(cs, centroid_priority_4x, sample_locs_4x);
+      break;
+   case 8:
+      si_emit_max_16_sample_locs(cs, centroid_priority_8x, sample_locs_8x, 8);
+      break;
+   case 16:
+      si_emit_max_16_sample_locs(cs, centroid_priority_16x, sample_locs_16x, 16);
+      break;
+   }
 }
 
 void si_init_msaa_functions(struct si_context *sctx)
 {
-       int i;
+   int i;
 
-       sctx->b.get_sample_position = si_get_sample_position;
+   sctx->b.get_sample_position = si_get_sample_position;
 
-       si_get_sample_position(&sctx->b, 1, 0, sctx->sample_positions.x1[0]);
+   si_get_sample_position(&sctx->b, 1, 0, sctx->sample_positions.x1[0]);
 
-       for (i = 0; i < 2; i++)
-               si_get_sample_position(&sctx->b, 2, i, sctx->sample_positions.x2[i]);
-       for (i = 0; i < 4; i++)
-               si_get_sample_position(&sctx->b, 4, i, sctx->sample_positions.x4[i]);
-       for (i = 0; i < 8; i++)
-               si_get_sample_position(&sctx->b, 8, i, sctx->sample_positions.x8[i]);
-       for (i = 0; i < 16; i++)
-               si_get_sample_position(&sctx->b, 16, i, sctx->sample_positions.x16[i]);
+   for (i = 0; i < 2; i++)
+      si_get_sample_position(&sctx->b, 2, i, sctx->sample_positions.x2[i]);
+   for (i = 0; i < 4; i++)
+      si_get_sample_position(&sctx->b, 4, i, sctx->sample_positions.x4[i]);
+   for (i = 0; i < 8; i++)
+      si_get_sample_position(&sctx->b, 8, i, sctx->sample_positions.x8[i]);
+   for (i = 0; i < 16; i++)
+      si_get_sample_position(&sctx->b, 16, i, sctx->sample_positions.x16[i]);
 }
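
A minimal standalone sketch (not part of the diff) of how the 4-bit signed sample offsets packed by FILL_SREG above translate into the [0, 1) positions returned by si_get_sample_position: the offsets are in 1/16th-pixel units around the pixel centre and are mapped with (offset + 8) / 16. The helper name below is illustrative.

#include <stdio.h>

/* Convert one 4-bit signed sample offset (range [-8, 7], 1/16th-pixel units)
 * to the normalized [0, 1) position reported to the state tracker. */
static float sample_offset_to_pos(int offset)
{
   return (offset + 8) / 16.0f;
}

int main(void)
{
   /* The 2x MSAA table above stores the offsets (-4, -4) and (4, 4). */
   printf("sample 0: (%.4f, %.4f)\n", sample_offset_to_pos(-4), sample_offset_to_pos(-4));
   printf("sample 1: (%.4f, %.4f)\n", sample_offset_to_pos(4), sample_offset_to_pos(4));
   /* prints 0.2500/0.2500 and 0.7500/0.7500 */
   return 0;
}
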
index be7cda1d33245373b46a4d79b997db4b7ca93573..d322cd1f341d67c9c7c237622ed0d3d22f1d9ace 100644 (file)
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include "si_build_pm4.h"
-#include "sid.h"
-
+#include "ac_exp_param.h"
+#include "ac_shader_util.h"
 #include "compiler/nir/nir_serialize.h"
 #include "nir/tgsi_to_nir.h"
-#include "util/hash_table.h"
+#include "si_build_pm4.h"
+#include "sid.h"
 #include "util/crc32.h"
+#include "util/disk_cache.h"
+#include "util/hash_table.h"
+#include "util/mesa-sha1.h"
 #include "util/u_async_debug.h"
 #include "util/u_memory.h"
 #include "util/u_prim.h"
 
-#include "util/disk_cache.h"
-#include "util/mesa-sha1.h"
-#include "ac_exp_param.h"
-#include "ac_shader_util.h"
-
 /* SHADER_CACHE */
 
 /**
  * Return the IR key for the shader cache.
  */
 void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es,
-                        unsigned char ir_sha1_cache_key[20])
-{
-       struct blob blob = {};
-       unsigned ir_size;
-       void *ir_binary;
-
-       if (sel->nir_binary) {
-               ir_binary = sel->nir_binary;
-               ir_size = sel->nir_size;
-       } else {
-               assert(sel->nir);
-
-               blob_init(&blob);
-               nir_serialize(&blob, sel->nir, true);
-               ir_binary = blob.data;
-               ir_size = blob.size;
-       }
-
-       /* These settings affect the compilation, but they are not derived
-        * from the input shader IR.
-        */
-       unsigned shader_variant_flags = 0;
-
-       if (ngg)
-               shader_variant_flags |= 1 << 0;
-       if (sel->nir)
-               shader_variant_flags |= 1 << 1;
-       if (si_get_wave_size(sel->screen, sel->type, ngg, es, false) == 32)
-               shader_variant_flags |= 1 << 2;
-       if (sel->type == PIPE_SHADER_FRAGMENT &&
-           sel->info.uses_derivatives &&
-           sel->info.uses_kill &&
-           sel->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL))
-               shader_variant_flags |= 1 << 3;
-
-       /* This varies depending on whether compute-based culling is enabled. */
-       shader_variant_flags |= sel->screen->num_vbos_in_user_sgprs << 4;
-
-       struct mesa_sha1 ctx;
-       _mesa_sha1_init(&ctx);
-       _mesa_sha1_update(&ctx, &shader_variant_flags, 4);
-       _mesa_sha1_update(&ctx, ir_binary, ir_size);
-       if (sel->type == PIPE_SHADER_VERTEX ||
-           sel->type == PIPE_SHADER_TESS_EVAL ||
-           sel->type == PIPE_SHADER_GEOMETRY)
-               _mesa_sha1_update(&ctx, &sel->so, sizeof(sel->so));
-       _mesa_sha1_final(&ctx, ir_sha1_cache_key);
-
-       if (ir_binary == blob.data)
-               blob_finish(&blob);
+                         unsigned char ir_sha1_cache_key[20])
+{
+   struct blob blob = {};
+   unsigned ir_size;
+   void *ir_binary;
+
+   if (sel->nir_binary) {
+      ir_binary = sel->nir_binary;
+      ir_size = sel->nir_size;
+   } else {
+      assert(sel->nir);
+
+      blob_init(&blob);
+      nir_serialize(&blob, sel->nir, true);
+      ir_binary = blob.data;
+      ir_size = blob.size;
+   }
+
+   /* These settings affect the compilation, but they are not derived
+    * from the input shader IR.
+    */
+   unsigned shader_variant_flags = 0;
+
+   if (ngg)
+      shader_variant_flags |= 1 << 0;
+   if (sel->nir)
+      shader_variant_flags |= 1 << 1;
+   if (si_get_wave_size(sel->screen, sel->type, ngg, es, false) == 32)
+      shader_variant_flags |= 1 << 2;
+   if (sel->type == PIPE_SHADER_FRAGMENT && sel->info.uses_derivatives && sel->info.uses_kill &&
+       sel->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL))
+      shader_variant_flags |= 1 << 3;
+
+   /* This varies depending on whether compute-based culling is enabled. */
+   shader_variant_flags |= sel->screen->num_vbos_in_user_sgprs << 4;
+
+   struct mesa_sha1 ctx;
+   _mesa_sha1_init(&ctx);
+   _mesa_sha1_update(&ctx, &shader_variant_flags, 4);
+   _mesa_sha1_update(&ctx, ir_binary, ir_size);
+   if (sel->type == PIPE_SHADER_VERTEX || sel->type == PIPE_SHADER_TESS_EVAL ||
+       sel->type == PIPE_SHADER_GEOMETRY)
+      _mesa_sha1_update(&ctx, &sel->so, sizeof(sel->so));
+   _mesa_sha1_final(&ctx, ir_sha1_cache_key);
+
+   if (ir_binary == blob.data)
+      blob_finish(&blob);
 }
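
A minimal standalone sketch (not part of the diff, helper name illustrative) of the shader_variant_flags word hashed together with the IR above: one bit each for NGG, NIR input, Wave32 and the FS_CORRECT_DERIVS_AFTER_KILL workaround, with the VBO-in-user-SGPR count in the remaining bits so differently configured screens never share cache keys.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Compose the per-variant flag word that is hashed together with the IR. */
static uint32_t make_variant_flags(bool ngg, bool is_nir, bool wave32,
                                   bool fs_correct_derivs_after_kill,
                                   unsigned num_vbos_in_user_sgprs)
{
   uint32_t flags = 0;
   if (ngg)
      flags |= 1u << 0;
   if (is_nir)
      flags |= 1u << 1;
   if (wave32)
      flags |= 1u << 2;
   if (fs_correct_derivs_after_kill)
      flags |= 1u << 3;
   /* The remaining bits hold the VBO-in-user-SGPR count. */
   flags |= num_vbos_in_user_sgprs << 4;
   return flags;
}

int main(void)
{
   printf("0x%x\n", (unsigned)make_variant_flags(true, true, false, false, 5)); /* 0x53 */
   return 0;
}
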
 
 /** Copy "data" to "ptr" and return the next dword following copied data. */
 static uint32_t *write_data(uint32_t *ptr, const void *data, unsigned size)
 {
-       /* data may be NULL if size == 0 */
-       if (size)
-               memcpy(ptr, data, size);
-       ptr += DIV_ROUND_UP(size, 4);
-       return ptr;
+   /* data may be NULL if size == 0 */
+   if (size)
+      memcpy(ptr, data, size);
+   ptr += DIV_ROUND_UP(size, 4);
+   return ptr;
 }
 
 /** Read data from "ptr". Return the next dword following the data. */
 static uint32_t *read_data(uint32_t *ptr, void *data, unsigned size)
 {
-       memcpy(data, ptr, size);
-       ptr += DIV_ROUND_UP(size, 4);
-       return ptr;
+   memcpy(data, ptr, size);
+   ptr += DIV_ROUND_UP(size, 4);
+   return ptr;
 }
 
 /**
@@ -120,8 +115,8 @@ static uint32_t *read_data(uint32_t *ptr, void *data, unsigned size)
  */
 static uint32_t *write_chunk(uint32_t *ptr, const void *data, unsigned size)
 {
-       *ptr++ = size;
-       return write_data(ptr, data, size);
+   *ptr++ = size;
+   return write_data(ptr, data, size);
 }
 
 /**
@@ -130,12 +125,12 @@ static uint32_t *write_chunk(uint32_t *ptr, const void *data, unsigned size)
  */
 static uint32_t *read_chunk(uint32_t *ptr, void **data, unsigned *size)
 {
-       *size = *ptr++;
-       assert(*data == NULL);
-       if (!*size)
-               return ptr;
-       *data = malloc(*size);
-       return read_data(ptr, *data, *size);
+   *size = *ptr++;
+   assert(*data == NULL);
+   if (!*size)
+      return ptr;
+   *data = malloc(*size);
+   return read_data(ptr, *data, *size);
 }
 
 /**
@@ -144,258 +139,236 @@ static uint32_t *read_chunk(uint32_t *ptr, void **data, unsigned *size)
  */
 static void *si_get_shader_binary(struct si_shader *shader)
 {
-       /* There is always a size of data followed by the data itself. */
-       unsigned llvm_ir_size = shader->binary.llvm_ir_string ?
-                               strlen(shader->binary.llvm_ir_string) + 1 : 0;
-
-       /* Refuse to allocate overly large buffers and guard against integer
-        * overflow. */
-       if (shader->binary.elf_size > UINT_MAX / 4 ||
-           llvm_ir_size > UINT_MAX / 4)
-               return NULL;
-
-       unsigned size =
-               4 + /* total size */
-               4 + /* CRC32 of the data below */
-               align(sizeof(shader->config), 4) +
-               align(sizeof(shader->info), 4) +
-               4 + align(shader->binary.elf_size, 4) +
-               4 + align(llvm_ir_size, 4);
-       void *buffer = CALLOC(1, size);
-       uint32_t *ptr = (uint32_t*)buffer;
-
-       if (!buffer)
-               return NULL;
-
-       *ptr++ = size;
-       ptr++; /* CRC32 is calculated at the end. */
-
-       ptr = write_data(ptr, &shader->config, sizeof(shader->config));
-       ptr = write_data(ptr, &shader->info, sizeof(shader->info));
-       ptr = write_chunk(ptr, shader->binary.elf_buffer, shader->binary.elf_size);
-       ptr = write_chunk(ptr, shader->binary.llvm_ir_string, llvm_ir_size);
-       assert((char *)ptr - (char *)buffer == size);
-
-       /* Compute CRC32. */
-       ptr = (uint32_t*)buffer;
-       ptr++;
-       *ptr = util_hash_crc32(ptr + 1, size - 8);
-
-       return buffer;
+   /* There is always a size of data followed by the data itself. */
+   unsigned llvm_ir_size =
+      shader->binary.llvm_ir_string ? strlen(shader->binary.llvm_ir_string) + 1 : 0;
+
+   /* Refuse to allocate overly large buffers and guard against integer
+    * overflow. */
+   if (shader->binary.elf_size > UINT_MAX / 4 || llvm_ir_size > UINT_MAX / 4)
+      return NULL;
+
+   unsigned size = 4 + /* total size */
+                   4 + /* CRC32 of the data below */
+                   align(sizeof(shader->config), 4) + align(sizeof(shader->info), 4) + 4 +
+                   align(shader->binary.elf_size, 4) + 4 + align(llvm_ir_size, 4);
+   void *buffer = CALLOC(1, size);
+   uint32_t *ptr = (uint32_t *)buffer;
+
+   if (!buffer)
+      return NULL;
+
+   *ptr++ = size;
+   ptr++; /* CRC32 is calculated at the end. */
+
+   ptr = write_data(ptr, &shader->config, sizeof(shader->config));
+   ptr = write_data(ptr, &shader->info, sizeof(shader->info));
+   ptr = write_chunk(ptr, shader->binary.elf_buffer, shader->binary.elf_size);
+   ptr = write_chunk(ptr, shader->binary.llvm_ir_string, llvm_ir_size);
+   assert((char *)ptr - (char *)buffer == size);
+
+   /* Compute CRC32. */
+   ptr = (uint32_t *)buffer;
+   ptr++;
+   *ptr = util_hash_crc32(ptr + 1, size - 8);
+
+   return buffer;
 }
 
 static bool si_load_shader_binary(struct si_shader *shader, void *binary)
 {
-       uint32_t *ptr = (uint32_t*)binary;
-       uint32_t size = *ptr++;
-       uint32_t crc32 = *ptr++;
-       unsigned chunk_size;
-       unsigned elf_size;
-
-       if (util_hash_crc32(ptr, size - 8) != crc32) {
-               fprintf(stderr, "radeonsi: binary shader has invalid CRC32\n");
-               return false;
-       }
-
-       ptr = read_data(ptr, &shader->config, sizeof(shader->config));
-       ptr = read_data(ptr, &shader->info, sizeof(shader->info));
-       ptr = read_chunk(ptr, (void**)&shader->binary.elf_buffer,
-                        &elf_size);
-       shader->binary.elf_size = elf_size;
-       ptr = read_chunk(ptr, (void**)&shader->binary.llvm_ir_string, &chunk_size);
-
-       return true;
+   uint32_t *ptr = (uint32_t *)binary;
+   uint32_t size = *ptr++;
+   uint32_t crc32 = *ptr++;
+   unsigned chunk_size;
+   unsigned elf_size;
+
+   if (util_hash_crc32(ptr, size - 8) != crc32) {
+      fprintf(stderr, "radeonsi: binary shader has invalid CRC32\n");
+      return false;
+   }
+
+   ptr = read_data(ptr, &shader->config, sizeof(shader->config));
+   ptr = read_data(ptr, &shader->info, sizeof(shader->info));
+   ptr = read_chunk(ptr, (void **)&shader->binary.elf_buffer, &elf_size);
+   shader->binary.elf_size = elf_size;
+   ptr = read_chunk(ptr, (void **)&shader->binary.llvm_ir_string, &chunk_size);
+
+   return true;
 }
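
A minimal standalone sketch (not part of the diff) of the cache-blob layout that si_get_shader_binary and si_load_shader_binary above agree on: a total-size dword, a CRC32 dword, the raw config and info structs, then two size-prefixed chunks (ELF image, optional LLVM IR string), everything dword-aligned; the stored CRC32 (util_hash_crc32 in the real code) covers all bytes after the first eight. The sizes below are made-up example values.

#include <stdio.h>

#define ALIGN4(x) (((x) + 3u) & ~3u)

int main(void)
{
   unsigned config_size  = 48;   /* stands in for sizeof(shader->config) */
   unsigned info_size    = 96;   /* stands in for sizeof(shader->info)   */
   unsigned elf_size     = 1000; /* ELF image size in bytes              */
   unsigned llvm_ir_size = 0;    /* no LLVM IR string kept here          */

   unsigned total = 4                         /* total size dword           */
                  + 4                         /* CRC32 of everything below  */
                  + ALIGN4(config_size)       /* raw config struct          */
                  + ALIGN4(info_size)         /* raw info struct            */
                  + 4 + ALIGN4(elf_size)      /* size-prefixed ELF chunk    */
                  + 4 + ALIGN4(llvm_ir_size); /* size-prefixed IR chunk     */

   /* The CRC32 stored in the second dword covers everything after the
    * first two dwords, i.e. total - 8 bytes. */
   printf("blob size: %u bytes, CRC32 covers %u bytes\n", total, total - 8);
   return 0;
}
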
 
 /**
  * Insert a shader into the cache. It's assumed the shader is not in the cache.
  * Use si_shader_cache_load_shader before calling this.
  */
-void si_shader_cache_insert_shader(struct si_screen *sscreen,
-                                  unsigned char ir_sha1_cache_key[20],
-                                  struct si_shader *shader,
-                                  bool insert_into_disk_cache)
-{
-       void *hw_binary;
-       struct hash_entry *entry;
-       uint8_t key[CACHE_KEY_SIZE];
-
-       entry = _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key);
-       if (entry)
-               return; /* already added */
-
-       hw_binary = si_get_shader_binary(shader);
-       if (!hw_binary)
-               return;
-
-       if (_mesa_hash_table_insert(sscreen->shader_cache,
-                                   mem_dup(ir_sha1_cache_key, 20),
-                                   hw_binary) == NULL) {
-               FREE(hw_binary);
-               return;
-       }
-
-       if (sscreen->disk_shader_cache && insert_into_disk_cache) {
-               disk_cache_compute_key(sscreen->disk_shader_cache,
-                                      ir_sha1_cache_key, 20, key);
-               disk_cache_put(sscreen->disk_shader_cache, key, hw_binary,
-                              *((uint32_t *) hw_binary), NULL);
-       }
-}
-
-bool si_shader_cache_load_shader(struct si_screen *sscreen,
-                                unsigned char ir_sha1_cache_key[20],
-                                struct si_shader *shader)
-{
-       struct hash_entry *entry =
-               _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key);
-
-       if (entry) {
-               if (si_load_shader_binary(shader, entry->data)) {
-                       p_atomic_inc(&sscreen->num_memory_shader_cache_hits);
-                       return true;
-               }
-       }
-       p_atomic_inc(&sscreen->num_memory_shader_cache_misses);
-
-       if (!sscreen->disk_shader_cache)
-               return false;
-
-       unsigned char sha1[CACHE_KEY_SIZE];
-       disk_cache_compute_key(sscreen->disk_shader_cache, ir_sha1_cache_key,
-                              20, sha1);
-
-       size_t binary_size;
-       uint8_t *buffer = disk_cache_get(sscreen->disk_shader_cache, sha1,
-                                        &binary_size);
-       if (buffer) {
-               if (binary_size >= sizeof(uint32_t) &&
-                   *((uint32_t*)buffer) == binary_size) {
-                       if (si_load_shader_binary(shader, buffer)) {
-                               free(buffer);
-                               si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key,
-                                                             shader, false);
-                               p_atomic_inc(&sscreen->num_disk_shader_cache_hits);
-                               return true;
-                       }
-               } else {
-                       /* Something has gone wrong discard the item from the cache and
-                        * rebuild/link from source.
-                        */
-                       assert(!"Invalid radeonsi shader disk cache item!");
-                       disk_cache_remove(sscreen->disk_shader_cache, sha1);
-               }
-       }
-
-       free(buffer);
-       p_atomic_inc(&sscreen->num_disk_shader_cache_misses);
-       return false;
+void si_shader_cache_insert_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20],
+                                   struct si_shader *shader, bool insert_into_disk_cache)
+{
+   void *hw_binary;
+   struct hash_entry *entry;
+   uint8_t key[CACHE_KEY_SIZE];
+
+   entry = _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key);
+   if (entry)
+      return; /* already added */
+
+   hw_binary = si_get_shader_binary(shader);
+   if (!hw_binary)
+      return;
+
+   if (_mesa_hash_table_insert(sscreen->shader_cache, mem_dup(ir_sha1_cache_key, 20), hw_binary) ==
+       NULL) {
+      FREE(hw_binary);
+      return;
+   }
+
+   if (sscreen->disk_shader_cache && insert_into_disk_cache) {
+      disk_cache_compute_key(sscreen->disk_shader_cache, ir_sha1_cache_key, 20, key);
+      disk_cache_put(sscreen->disk_shader_cache, key, hw_binary, *((uint32_t *)hw_binary), NULL);
+   }
+}
+
+bool si_shader_cache_load_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20],
+                                 struct si_shader *shader)
+{
+   struct hash_entry *entry = _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key);
+
+   if (entry) {
+      if (si_load_shader_binary(shader, entry->data)) {
+         p_atomic_inc(&sscreen->num_memory_shader_cache_hits);
+         return true;
+      }
+   }
+   p_atomic_inc(&sscreen->num_memory_shader_cache_misses);
+
+   if (!sscreen->disk_shader_cache)
+      return false;
+
+   unsigned char sha1[CACHE_KEY_SIZE];
+   disk_cache_compute_key(sscreen->disk_shader_cache, ir_sha1_cache_key, 20, sha1);
+
+   size_t binary_size;
+   uint8_t *buffer = disk_cache_get(sscreen->disk_shader_cache, sha1, &binary_size);
+   if (buffer) {
+      if (binary_size >= sizeof(uint32_t) && *((uint32_t *)buffer) == binary_size) {
+         if (si_load_shader_binary(shader, buffer)) {
+            free(buffer);
+            si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, shader, false);
+            p_atomic_inc(&sscreen->num_disk_shader_cache_hits);
+            return true;
+         }
+      } else {
+         /* Something has gone wrong; discard the item from the cache and
+          * rebuild/link from source.
+          */
+         assert(!"Invalid radeonsi shader disk cache item!");
+         disk_cache_remove(sscreen->disk_shader_cache, sha1);
+      }
+   }
+
+   free(buffer);
+   p_atomic_inc(&sscreen->num_disk_shader_cache_misses);
+   return false;
 }
 
 static uint32_t si_shader_cache_key_hash(const void *key)
 {
-       /* Take the first dword of SHA1. */
-       return *(uint32_t*)key;
+   /* Take the first dword of SHA1. */
+   return *(uint32_t *)key;
 }
 
 static bool si_shader_cache_key_equals(const void *a, const void *b)
 {
-       /* Compare SHA1s. */
-       return memcmp(a, b, 20) == 0;
+   /* Compare SHA1s. */
+   return memcmp(a, b, 20) == 0;
 }
 
 static void si_destroy_shader_cache_entry(struct hash_entry *entry)
 {
-       FREE((void*)entry->key);
-       FREE(entry->data);
+   FREE((void *)entry->key);
+   FREE(entry->data);
 }
 
 bool si_init_shader_cache(struct si_screen *sscreen)
 {
-       (void) simple_mtx_init(&sscreen->shader_cache_mutex, mtx_plain);
-       sscreen->shader_cache =
-               _mesa_hash_table_create(NULL,
-                                       si_shader_cache_key_hash,
-                                       si_shader_cache_key_equals);
+   (void)simple_mtx_init(&sscreen->shader_cache_mutex, mtx_plain);
+   sscreen->shader_cache =
+      _mesa_hash_table_create(NULL, si_shader_cache_key_hash, si_shader_cache_key_equals);
 
-       return sscreen->shader_cache != NULL;
+   return sscreen->shader_cache != NULL;
 }
 
 void si_destroy_shader_cache(struct si_screen *sscreen)
 {
-       if (sscreen->shader_cache)
-               _mesa_hash_table_destroy(sscreen->shader_cache,
-                                        si_destroy_shader_cache_entry);
-       simple_mtx_destroy(&sscreen->shader_cache_mutex);
+   if (sscreen->shader_cache)
+      _mesa_hash_table_destroy(sscreen->shader_cache, si_destroy_shader_cache_entry);
+   simple_mtx_destroy(&sscreen->shader_cache_mutex);
 }
 
 /* SHADER STATES */
 
-static void si_set_tesseval_regs(struct si_screen *sscreen,
-                                const struct si_shader_selector *tes,
-                                struct si_pm4_state *pm4)
-{
-       const struct si_shader_info *info = &tes->info;
-       unsigned tes_prim_mode = info->properties[TGSI_PROPERTY_TES_PRIM_MODE];
-       unsigned tes_spacing = info->properties[TGSI_PROPERTY_TES_SPACING];
-       bool tes_vertex_order_cw = info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW];
-       bool tes_point_mode = info->properties[TGSI_PROPERTY_TES_POINT_MODE];
-       unsigned type, partitioning, topology, distribution_mode;
-
-       switch (tes_prim_mode) {
-       case PIPE_PRIM_LINES:
-               type = V_028B6C_TESS_ISOLINE;
-               break;
-       case PIPE_PRIM_TRIANGLES:
-               type = V_028B6C_TESS_TRIANGLE;
-               break;
-       case PIPE_PRIM_QUADS:
-               type = V_028B6C_TESS_QUAD;
-               break;
-       default:
-               assert(0);
-               return;
-       }
-
-       switch (tes_spacing) {
-       case PIPE_TESS_SPACING_FRACTIONAL_ODD:
-               partitioning = V_028B6C_PART_FRAC_ODD;
-               break;
-       case PIPE_TESS_SPACING_FRACTIONAL_EVEN:
-               partitioning = V_028B6C_PART_FRAC_EVEN;
-               break;
-       case PIPE_TESS_SPACING_EQUAL:
-               partitioning = V_028B6C_PART_INTEGER;
-               break;
-       default:
-               assert(0);
-               return;
-       }
-
-       if (tes_point_mode)
-               topology = V_028B6C_OUTPUT_POINT;
-       else if (tes_prim_mode == PIPE_PRIM_LINES)
-               topology = V_028B6C_OUTPUT_LINE;
-       else if (tes_vertex_order_cw)
-               /* for some reason, this must be the other way around */
-               topology = V_028B6C_OUTPUT_TRIANGLE_CCW;
-       else
-               topology = V_028B6C_OUTPUT_TRIANGLE_CW;
-
-       if (sscreen->info.has_distributed_tess) {
-               if (sscreen->info.family == CHIP_FIJI ||
-                   sscreen->info.family >= CHIP_POLARIS10)
-                       distribution_mode = V_028B6C_DISTRIBUTION_MODE_TRAPEZOIDS;
-               else
-                       distribution_mode = V_028B6C_DISTRIBUTION_MODE_DONUTS;
-       } else
-               distribution_mode = V_028B6C_DISTRIBUTION_MODE_NO_DIST;
-
-       assert(pm4->shader);
-       pm4->shader->vgt_tf_param = S_028B6C_TYPE(type) |
-                                   S_028B6C_PARTITIONING(partitioning) |
-                                   S_028B6C_TOPOLOGY(topology) |
-                                   S_028B6C_DISTRIBUTION_MODE(distribution_mode);
+static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shader_selector *tes,
+                                 struct si_pm4_state *pm4)
+{
+   const struct si_shader_info *info = &tes->info;
+   unsigned tes_prim_mode = info->properties[TGSI_PROPERTY_TES_PRIM_MODE];
+   unsigned tes_spacing = info->properties[TGSI_PROPERTY_TES_SPACING];
+   bool tes_vertex_order_cw = info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW];
+   bool tes_point_mode = info->properties[TGSI_PROPERTY_TES_POINT_MODE];
+   unsigned type, partitioning, topology, distribution_mode;
+
+   switch (tes_prim_mode) {
+   case PIPE_PRIM_LINES:
+      type = V_028B6C_TESS_ISOLINE;
+      break;
+   case PIPE_PRIM_TRIANGLES:
+      type = V_028B6C_TESS_TRIANGLE;
+      break;
+   case PIPE_PRIM_QUADS:
+      type = V_028B6C_TESS_QUAD;
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   switch (tes_spacing) {
+   case PIPE_TESS_SPACING_FRACTIONAL_ODD:
+      partitioning = V_028B6C_PART_FRAC_ODD;
+      break;
+   case PIPE_TESS_SPACING_FRACTIONAL_EVEN:
+      partitioning = V_028B6C_PART_FRAC_EVEN;
+      break;
+   case PIPE_TESS_SPACING_EQUAL:
+      partitioning = V_028B6C_PART_INTEGER;
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   if (tes_point_mode)
+      topology = V_028B6C_OUTPUT_POINT;
+   else if (tes_prim_mode == PIPE_PRIM_LINES)
+      topology = V_028B6C_OUTPUT_LINE;
+   else if (tes_vertex_order_cw)
+      /* for some reason, this must be the other way around */
+      topology = V_028B6C_OUTPUT_TRIANGLE_CCW;
+   else
+      topology = V_028B6C_OUTPUT_TRIANGLE_CW;
+
+   if (sscreen->info.has_distributed_tess) {
+      if (sscreen->info.family == CHIP_FIJI || sscreen->info.family >= CHIP_POLARIS10)
+         distribution_mode = V_028B6C_DISTRIBUTION_MODE_TRAPEZOIDS;
+      else
+         distribution_mode = V_028B6C_DISTRIBUTION_MODE_DONUTS;
+   } else
+      distribution_mode = V_028B6C_DISTRIBUTION_MODE_NO_DIST;
+
+   assert(pm4->shader);
+   pm4->shader->vgt_tf_param = S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) |
+                               S_028B6C_TOPOLOGY(topology) |
+                               S_028B6C_DISTRIBUTION_MODE(distribution_mode);
 }
 
 /* Polaris needs different VTX_REUSE_DEPTH settings depending on
@@ -412,722 +385,674 @@ static void si_set_tesseval_regs(struct si_screen *sscreen,
  *
  * If "shader" is NULL, it's assumed it's not LS or GS copy shader.
  */
-static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen,
-                                        struct si_shader_selector *sel,
-                                        struct si_shader *shader,
-                                        struct si_pm4_state *pm4)
+static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen, struct si_shader_selector *sel,
+                                         struct si_shader *shader, struct si_pm4_state *pm4)
 {
-       unsigned type = sel->type;
-
-       if (sscreen->info.family < CHIP_POLARIS10 ||
-           sscreen->info.chip_class >= GFX10)
-               return;
-
-       /* VS as VS, or VS as ES: */
-       if ((type == PIPE_SHADER_VERTEX &&
-            (!shader ||
-             (!shader->key.as_ls && !shader->is_gs_copy_shader))) ||
-           /* TES as VS, or TES as ES: */
-           type == PIPE_SHADER_TESS_EVAL) {
-               unsigned vtx_reuse_depth = 30;
-
-               if (type == PIPE_SHADER_TESS_EVAL &&
-                   sel->info.properties[TGSI_PROPERTY_TES_SPACING] ==
-                   PIPE_TESS_SPACING_FRACTIONAL_ODD)
-                       vtx_reuse_depth = 14;
-
-               assert(pm4->shader);
-               pm4->shader->vgt_vertex_reuse_block_cntl = vtx_reuse_depth;
-       }
+   unsigned type = sel->type;
+
+   if (sscreen->info.family < CHIP_POLARIS10 || sscreen->info.chip_class >= GFX10)
+      return;
+
+   /* VS as VS, or VS as ES: */
+   if ((type == PIPE_SHADER_VERTEX &&
+        (!shader || (!shader->key.as_ls && !shader->is_gs_copy_shader))) ||
+       /* TES as VS, or TES as ES: */
+       type == PIPE_SHADER_TESS_EVAL) {
+      unsigned vtx_reuse_depth = 30;
+
+      if (type == PIPE_SHADER_TESS_EVAL &&
+          sel->info.properties[TGSI_PROPERTY_TES_SPACING] == PIPE_TESS_SPACING_FRACTIONAL_ODD)
+         vtx_reuse_depth = 14;
+
+      assert(pm4->shader);
+      pm4->shader->vgt_vertex_reuse_block_cntl = vtx_reuse_depth;
+   }
 }
 
 static struct si_pm4_state *si_get_shader_pm4_state(struct si_shader *shader)
 {
-       if (shader->pm4)
-               si_pm4_clear_state(shader->pm4);
-       else
-               shader->pm4 = CALLOC_STRUCT(si_pm4_state);
-
-       if (shader->pm4) {
-               shader->pm4->shader = shader;
-               return shader->pm4;
-       } else {
-               fprintf(stderr, "radeonsi: Failed to create pm4 state.\n");
-               return NULL;
-       }
+   if (shader->pm4)
+      si_pm4_clear_state(shader->pm4);
+   else
+      shader->pm4 = CALLOC_STRUCT(si_pm4_state);
+
+   if (shader->pm4) {
+      shader->pm4->shader = shader;
+      return shader->pm4;
+   } else {
+      fprintf(stderr, "radeonsi: Failed to create pm4 state.\n");
+      return NULL;
+   }
 }
 
 static unsigned si_get_num_vs_user_sgprs(struct si_shader *shader,
-                                        unsigned num_always_on_user_sgprs)
+                                         unsigned num_always_on_user_sgprs)
 {
-       struct si_shader_selector *vs = shader->previous_stage_sel ?
-                       shader->previous_stage_sel : shader->selector;
-       unsigned num_vbos_in_user_sgprs = vs->num_vbos_in_user_sgprs;
+   struct si_shader_selector *vs =
+      shader->previous_stage_sel ? shader->previous_stage_sel : shader->selector;
+   unsigned num_vbos_in_user_sgprs = vs->num_vbos_in_user_sgprs;
 
-       /* 1 SGPR is reserved for the vertex buffer pointer. */
-       assert(num_always_on_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST - 1);
+   /* 1 SGPR is reserved for the vertex buffer pointer. */
+   assert(num_always_on_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST - 1);
 
-       if (num_vbos_in_user_sgprs)
-               return SI_SGPR_VS_VB_DESCRIPTOR_FIRST + num_vbos_in_user_sgprs * 4;
+   if (num_vbos_in_user_sgprs)
+      return SI_SGPR_VS_VB_DESCRIPTOR_FIRST + num_vbos_in_user_sgprs * 4;
 
-       /* Add the pointer to VBO descriptors. */
-       return num_always_on_user_sgprs + 1;
+   /* Add the pointer to VBO descriptors. */
+   return num_always_on_user_sgprs + 1;
 }
 
 /* Return VGPR_COMP_CNT for the API vertex shader. This can be hw LS, LSHS, ES, ESGS, VS. */
-static unsigned si_get_vs_vgpr_comp_cnt(struct si_screen *sscreen,
-                                       struct si_shader *shader, bool legacy_vs_prim_id)
-{
-       assert(shader->selector->type == PIPE_SHADER_VERTEX ||
-              (shader->previous_stage_sel &&
-               shader->previous_stage_sel->type == PIPE_SHADER_VERTEX));
-
-       /* GFX6-9 LS    (VertexID, RelAutoindex,                InstanceID / StepRate0(==1), ...).
-        * GFX6-9 ES,VS (VertexID, InstanceID / StepRate0(==1), VSPrimID,                    ...)
-        * GFX10  LS    (VertexID, RelAutoindex,                UserVGPR1,                   InstanceID).
-        * GFX10  ES,VS (VertexID, UserVGPR0,                   UserVGPR1 or VSPrimID,       UserVGPR2 or InstanceID)
-        */
-       bool is_ls = shader->selector->type == PIPE_SHADER_TESS_CTRL || shader->key.as_ls;
-
-       if (sscreen->info.chip_class >= GFX10 && shader->info.uses_instanceid)
-               return 3;
-       else if ((is_ls && shader->info.uses_instanceid) || legacy_vs_prim_id)
-               return 2;
-       else if (is_ls || shader->info.uses_instanceid)
-               return 1;
-       else
-               return 0;
+static unsigned si_get_vs_vgpr_comp_cnt(struct si_screen *sscreen, struct si_shader *shader,
+                                        bool legacy_vs_prim_id)
+{
+   assert(shader->selector->type == PIPE_SHADER_VERTEX ||
+          (shader->previous_stage_sel && shader->previous_stage_sel->type == PIPE_SHADER_VERTEX));
+
+   /* GFX6-9 LS    (VertexID, RelAutoindex,                InstanceID / StepRate0(==1), ...).
+    * GFX6-9 ES,VS (VertexID, InstanceID / StepRate0(==1), VSPrimID,                    ...)
+    * GFX10  LS    (VertexID, RelAutoindex,                UserVGPR1,                   InstanceID).
+    * GFX10  ES,VS (VertexID, UserVGPR0,                   UserVGPR1 or VSPrimID,       UserVGPR2 or
+    * InstanceID)
+    */
+   bool is_ls = shader->selector->type == PIPE_SHADER_TESS_CTRL || shader->key.as_ls;
+
+   if (sscreen->info.chip_class >= GFX10 && shader->info.uses_instanceid)
+      return 3;
+   else if ((is_ls && shader->info.uses_instanceid) || legacy_vs_prim_id)
+      return 2;
+   else if (is_ls || shader->info.uses_instanceid)
+      return 1;
+   else
+      return 0;
 }
 
 static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader)
 {
-       struct si_pm4_state *pm4;
-       uint64_t va;
+   struct si_pm4_state *pm4;
+   uint64_t va;
 
-       assert(sscreen->info.chip_class <= GFX8);
+   assert(sscreen->info.chip_class <= GFX8);
 
-       pm4 = si_get_shader_pm4_state(shader);
-       if (!pm4)
-               return;
+   pm4 = si_get_shader_pm4_state(shader);
+   if (!pm4)
+      return;
 
-       va = shader->bo->gpu_address;
-       si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
+   va = shader->bo->gpu_address;
+   si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
 
-       si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
-       si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40));
+   si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
+   si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40));
 
-       shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) |
-                          S_00B528_SGPRS((shader->config.num_sgprs - 1) / 8) |
-                          S_00B528_VGPR_COMP_CNT(si_get_vs_vgpr_comp_cnt(sscreen, shader, false)) |
-                          S_00B528_DX10_CLAMP(1) |
-                          S_00B528_FLOAT_MODE(shader->config.float_mode);
-       shader->config.rsrc2 = S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR)) |
-                          S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
+   shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) |
+                          S_00B528_SGPRS((shader->config.num_sgprs - 1) / 8) |
+                          S_00B528_VGPR_COMP_CNT(si_get_vs_vgpr_comp_cnt(sscreen, shader, false)) |
+                          S_00B528_DX10_CLAMP(1) | S_00B528_FLOAT_MODE(shader->config.float_mode);
+   shader->config.rsrc2 =
+      S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR)) |
+      S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
 }
 
 static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
 {
-       struct si_pm4_state *pm4;
-       uint64_t va;
-
-       pm4 = si_get_shader_pm4_state(shader);
-       if (!pm4)
-               return;
-
-       va = shader->bo->gpu_address;
-       si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
-
-       if (sscreen->info.chip_class >= GFX9) {
-               if (sscreen->info.chip_class >= GFX10) {
-                       si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
-                       si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40));
-               } else {
-                       si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8);
-                       si_pm4_set_reg(pm4, R_00B414_SPI_SHADER_PGM_HI_LS, S_00B414_MEM_BASE(va >> 40));
-               }
-
-               unsigned num_user_sgprs =
-                       si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR);
-
-               shader->config.rsrc2 =
-                       S_00B42C_USER_SGPR(num_user_sgprs) |
-                       S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
-
-               if (sscreen->info.chip_class >= GFX10)
-                       shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5);
-               else
-                       shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
-       } else {
-               si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
-               si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, S_00B424_MEM_BASE(va >> 40));
-
-               shader->config.rsrc2 =
-                       S_00B42C_USER_SGPR(GFX6_TCS_NUM_USER_SGPR) |
-                       S_00B42C_OC_LDS_EN(1) |
-                       S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
-       }
-
-       si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
-                      S_00B428_VGPRS((shader->config.num_vgprs - 1) /
-                                     (sscreen->ge_wave_size == 32 ? 8 : 4)) |
-                      (sscreen->info.chip_class <= GFX9 ?
-                               S_00B428_SGPRS((shader->config.num_sgprs - 1) / 8) : 0) |
-                      S_00B428_DX10_CLAMP(1) |
-                      S_00B428_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
-                      S_00B428_WGP_MODE(sscreen->info.chip_class >= GFX10) |
-                      S_00B428_FLOAT_MODE(shader->config.float_mode) |
-                      S_00B428_LS_VGPR_COMP_CNT(sscreen->info.chip_class >= GFX9 ?
-                                                si_get_vs_vgpr_comp_cnt(sscreen, shader, false) : 0));
-
-       if (sscreen->info.chip_class <= GFX8) {
-               si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
-                              shader->config.rsrc2);
-       }
+   struct si_pm4_state *pm4;
+   uint64_t va;
+
+   pm4 = si_get_shader_pm4_state(shader);
+   if (!pm4)
+      return;
+
+   va = shader->bo->gpu_address;
+   si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
+
+   if (sscreen->info.chip_class >= GFX9) {
+      if (sscreen->info.chip_class >= GFX10) {
+         si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
+         si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40));
+      } else {
+         si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8);
+         si_pm4_set_reg(pm4, R_00B414_SPI_SHADER_PGM_HI_LS, S_00B414_MEM_BASE(va >> 40));
+      }
+
+      unsigned num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR);
+
+      shader->config.rsrc2 = S_00B42C_USER_SGPR(num_user_sgprs) |
+                             S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
+
+      if (sscreen->info.chip_class >= GFX10)
+         shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5);
+      else
+         shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
+   } else {
+      si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
+      si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, S_00B424_MEM_BASE(va >> 40));
+
+      shader->config.rsrc2 = S_00B42C_USER_SGPR(GFX6_TCS_NUM_USER_SGPR) | S_00B42C_OC_LDS_EN(1) |
+                             S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
+   }
+
+   si_pm4_set_reg(
+      pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
+      S_00B428_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) |
+         (sscreen->info.chip_class <= GFX9 ? S_00B428_SGPRS((shader->config.num_sgprs - 1) / 8)
+                                           : 0) |
+         S_00B428_DX10_CLAMP(1) | S_00B428_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
+         S_00B428_WGP_MODE(sscreen->info.chip_class >= GFX10) |
+         S_00B428_FLOAT_MODE(shader->config.float_mode) |
+         S_00B428_LS_VGPR_COMP_CNT(sscreen->info.chip_class >= GFX9
+                                      ? si_get_vs_vgpr_comp_cnt(sscreen, shader, false)
+                                      : 0));
+
+   if (sscreen->info.chip_class <= GFX8) {
+      si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, shader->config.rsrc2);
+   }
 }
 
 static void si_emit_shader_es(struct si_context *sctx)
 {
-       struct si_shader *shader = sctx->queued.named.es->shader;
-       unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+   struct si_shader *shader = sctx->queued.named.es->shader;
+   unsigned initial_cdw = sctx->gfx_cs->current.cdw;
 
-       if (!shader)
-               return;
+   if (!shader)
+      return;
 
-       radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
-                                  SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
-                                  shader->selector->esgs_itemsize / 4);
+   radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
+                              SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
+                              shader->selector->esgs_itemsize / 4);
 
-       if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
-               radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
-                                          SI_TRACKED_VGT_TF_PARAM,
-                                          shader->vgt_tf_param);
+   if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
+      radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
+                                 shader->vgt_tf_param);
 
-       if (shader->vgt_vertex_reuse_block_cntl)
-               radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
-                                          SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
-                                          shader->vgt_vertex_reuse_block_cntl);
+   if (shader->vgt_vertex_reuse_block_cntl)
+      radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
+                                 SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
+                                 shader->vgt_vertex_reuse_block_cntl);
 
-       if (initial_cdw != sctx->gfx_cs->current.cdw)
-               sctx->context_roll = true;
+   if (initial_cdw != sctx->gfx_cs->current.cdw)
+      sctx->context_roll = true;
 }
 
 static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
 {
-       struct si_pm4_state *pm4;
-       unsigned num_user_sgprs;
-       unsigned vgpr_comp_cnt;
-       uint64_t va;
-       unsigned oc_lds_en;
-
-       assert(sscreen->info.chip_class <= GFX8);
-
-       pm4 = si_get_shader_pm4_state(shader);
-       if (!pm4)
-               return;
-
-       pm4->atom.emit = si_emit_shader_es;
-       va = shader->bo->gpu_address;
-       si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
-
-       if (shader->selector->type == PIPE_SHADER_VERTEX) {
-               vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false);
-               num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR);
-       } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
-               vgpr_comp_cnt = shader->selector->info.uses_primid ? 3 : 2;
-               num_user_sgprs = SI_TES_NUM_USER_SGPR;
-       } else
-               unreachable("invalid shader selector type");
-
-       oc_lds_en = shader->selector->type == PIPE_SHADER_TESS_EVAL ? 1 : 0;
-
-       si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
-       si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40));
-       si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
-                      S_00B328_VGPRS((shader->config.num_vgprs - 1) / 4) |
-                      S_00B328_SGPRS((shader->config.num_sgprs - 1) / 8) |
-                      S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) |
-                      S_00B328_DX10_CLAMP(1) |
-                      S_00B328_FLOAT_MODE(shader->config.float_mode));
-       si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
-                      S_00B32C_USER_SGPR(num_user_sgprs) |
-                      S_00B32C_OC_LDS_EN(oc_lds_en) |
-                      S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
-
-       if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
-               si_set_tesseval_regs(sscreen, shader->selector, pm4);
-
-       polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4);
-}
-
-void gfx9_get_gs_info(struct si_shader_selector *es,
-                     struct si_shader_selector *gs,
-                     struct gfx9_gs_info *out)
-{
-       unsigned gs_num_invocations = MAX2(gs->gs_num_invocations, 1);
-       unsigned input_prim = gs->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM];
-       bool uses_adjacency = input_prim >= PIPE_PRIM_LINES_ADJACENCY &&
-                             input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;
-
-       /* All these are in dwords: */
-       /* We can't allow using the whole LDS, because GS waves compete with
-        * other shader stages for LDS space. */
-       const unsigned max_lds_size = 8 * 1024;
-       const unsigned esgs_itemsize = es->esgs_itemsize / 4;
-       unsigned esgs_lds_size;
-
-       /* All these are per subgroup: */
-       const unsigned max_out_prims = 32 * 1024;
-       const unsigned max_es_verts = 255;
-       const unsigned ideal_gs_prims = 64;
-       unsigned max_gs_prims, gs_prims;
-       unsigned min_es_verts, es_verts, worst_case_es_verts;
-
-       if (uses_adjacency || gs_num_invocations > 1)
-               max_gs_prims = 127 / gs_num_invocations;
-       else
-               max_gs_prims = 255;
-
-       /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
-        * Make sure we don't go over the maximum value.
-        */
-       if (gs->gs_max_out_vertices > 0) {
-               max_gs_prims = MIN2(max_gs_prims,
-                                   max_out_prims /
-                                   (gs->gs_max_out_vertices * gs_num_invocations));
-       }
-       assert(max_gs_prims > 0);
-
-       /* If the primitive has adjacency, halve the number of vertices
-        * that will be reused in multiple primitives.
-        */
-       min_es_verts = gs->gs_input_verts_per_prim / (uses_adjacency ? 2 : 1);
-
-       gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
-       worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
-
-       /* Compute ESGS LDS size based on the worst case number of ES vertices
-        * needed to create the target number of GS prims per subgroup.
-        */
-       esgs_lds_size = esgs_itemsize * worst_case_es_verts;
-
-       /* If total LDS usage is too big, refactor partitions based on ratio
-        * of ESGS item sizes.
-        */
-       if (esgs_lds_size > max_lds_size) {
-               /* Our target GS Prims Per Subgroup was too large. Calculate
-                * the maximum number of GS Prims Per Subgroup that will fit
-                * into LDS, capped by the maximum that the hardware can support.
-                */
-               gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)),
-                               max_gs_prims);
-               assert(gs_prims > 0);
-               worst_case_es_verts = MIN2(min_es_verts * gs_prims,
-                                          max_es_verts);
-
-               esgs_lds_size = esgs_itemsize * worst_case_es_verts;
-               assert(esgs_lds_size <= max_lds_size);
-       }
-
-       /* Now calculate remaining ESGS information. */
-       if (esgs_lds_size)
-               es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts);
-       else
-               es_verts = max_es_verts;
-
-       /* Vertices for adjacency primitives are not always reused, so restore
-        * it for ES_VERTS_PER_SUBGRP.
-        */
-       min_es_verts = gs->gs_input_verts_per_prim;
-
-       /* For normal primitives, the VGT only checks if they are past the ES
-        * verts per subgroup after allocating a full GS primitive and if they
-        * are, kick off a new subgroup.  But if those additional ES verts are
-        * unique (e.g. not reused) we need to make sure there is enough LDS
-        * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP.
-        */
-       es_verts -= min_es_verts - 1;
-
-       out->es_verts_per_subgroup = es_verts;
-       out->gs_prims_per_subgroup = gs_prims;
-       out->gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
-       out->max_prims_per_subgroup = out->gs_inst_prims_in_subgroup *
-                                     gs->gs_max_out_vertices;
-       out->esgs_ring_size = 4 * esgs_lds_size;
-
-       assert(out->max_prims_per_subgroup <= max_out_prims);
+   struct si_pm4_state *pm4;
+   unsigned num_user_sgprs;
+   unsigned vgpr_comp_cnt;
+   uint64_t va;
+   unsigned oc_lds_en;
+
+   assert(sscreen->info.chip_class <= GFX8);
+
+   pm4 = si_get_shader_pm4_state(shader);
+   if (!pm4)
+      return;
+
+   pm4->atom.emit = si_emit_shader_es;
+   va = shader->bo->gpu_address;
+   si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
+
+   if (shader->selector->type == PIPE_SHADER_VERTEX) {
+      vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false);
+      num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR);
+   } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
+      vgpr_comp_cnt = shader->selector->info.uses_primid ? 3 : 2;
+      num_user_sgprs = SI_TES_NUM_USER_SGPR;
+   } else
+      unreachable("invalid shader selector type");
+
+   oc_lds_en = shader->selector->type == PIPE_SHADER_TESS_EVAL ? 1 : 0;
+
+   si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
+   si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40));
+   si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
+                  S_00B328_VGPRS((shader->config.num_vgprs - 1) / 4) |
+                     S_00B328_SGPRS((shader->config.num_sgprs - 1) / 8) |
+                     S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) | S_00B328_DX10_CLAMP(1) |
+                     S_00B328_FLOAT_MODE(shader->config.float_mode));
+   si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
+                  S_00B32C_USER_SGPR(num_user_sgprs) | S_00B32C_OC_LDS_EN(oc_lds_en) |
+                     S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
+
+   if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
+      si_set_tesseval_regs(sscreen, shader->selector, pm4);
+
+   polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4);
+}
+
+void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs,
+                      struct gfx9_gs_info *out)
+{
+   unsigned gs_num_invocations = MAX2(gs->gs_num_invocations, 1);
+   unsigned input_prim = gs->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM];
+   bool uses_adjacency =
+      input_prim >= PIPE_PRIM_LINES_ADJACENCY && input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;
+
+   /* All these are in dwords: */
+   /* We can't allow using the whole LDS, because GS waves compete with
+    * other shader stages for LDS space. */
+   const unsigned max_lds_size = 8 * 1024;
+   const unsigned esgs_itemsize = es->esgs_itemsize / 4;
+   unsigned esgs_lds_size;
+
+   /* All these are per subgroup: */
+   const unsigned max_out_prims = 32 * 1024;
+   const unsigned max_es_verts = 255;
+   const unsigned ideal_gs_prims = 64;
+   unsigned max_gs_prims, gs_prims;
+   unsigned min_es_verts, es_verts, worst_case_es_verts;
+
+   if (uses_adjacency || gs_num_invocations > 1)
+      max_gs_prims = 127 / gs_num_invocations;
+   else
+      max_gs_prims = 255;
+
+   /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
+    * Make sure we don't go over the maximum value.
+    */
+   if (gs->gs_max_out_vertices > 0) {
+      max_gs_prims =
+         MIN2(max_gs_prims, max_out_prims / (gs->gs_max_out_vertices * gs_num_invocations));
+   }
+   assert(max_gs_prims > 0);
+
+   /* If the primitive has adjacency, halve the number of vertices
+    * that will be reused in multiple primitives.
+    */
+   min_es_verts = gs->gs_input_verts_per_prim / (uses_adjacency ? 2 : 1);
+
+   gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
+   worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
+
+   /* Compute ESGS LDS size based on the worst case number of ES vertices
+    * needed to create the target number of GS prims per subgroup.
+    */
+   esgs_lds_size = esgs_itemsize * worst_case_es_verts;
+
+   /* If total LDS usage is too big, refactor partitions based on ratio
+    * of ESGS item sizes.
+    */
+   if (esgs_lds_size > max_lds_size) {
+      /* Our target GS Prims Per Subgroup was too large. Calculate
+       * the maximum number of GS Prims Per Subgroup that will fit
+       * into LDS, capped by the maximum that the hardware can support.
+       */
+      gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)), max_gs_prims);
+      assert(gs_prims > 0);
+      worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
+
+      esgs_lds_size = esgs_itemsize * worst_case_es_verts;
+      assert(esgs_lds_size <= max_lds_size);
+   }
+
+   /* Now calculate remaining ESGS information. */
+   if (esgs_lds_size)
+      es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts);
+   else
+      es_verts = max_es_verts;
+
+   /* Vertices for adjacency primitives are not always reused, so restore
+    * it for ES_VERTS_PER_SUBGRP.
+    */
+   min_es_verts = gs->gs_input_verts_per_prim;
+
+   /* For normal primitives, the VGT only checks if they are past the ES
+    * verts per subgroup after allocating a full GS primitive and if they
+    * are, kick off a new subgroup.  But if those additional ES verts are
+    * unique (e.g. not reused) we need to make sure there is enough LDS
+    * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP.
+    */
+   es_verts -= min_es_verts - 1;
+
+   out->es_verts_per_subgroup = es_verts;
+   out->gs_prims_per_subgroup = gs_prims;
+   out->gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
+   out->max_prims_per_subgroup = out->gs_inst_prims_in_subgroup * gs->gs_max_out_vertices;
+   out->esgs_ring_size = 4 * esgs_lds_size;
+
+   assert(out->max_prims_per_subgroup <= max_out_prims);
 }
 
 static void si_emit_shader_gs(struct si_context *sctx)
 {
-       struct si_shader *shader = sctx->queued.named.gs->shader;
-       unsigned initial_cdw = sctx->gfx_cs->current.cdw;
-
-       if (!shader)
-               return;
-
-       /* R_028A60_VGT_GSVS_RING_OFFSET_1, R_028A64_VGT_GSVS_RING_OFFSET_2
-        * R_028A68_VGT_GSVS_RING_OFFSET_3 */
-       radeon_opt_set_context_reg3(sctx, R_028A60_VGT_GSVS_RING_OFFSET_1,
-                                   SI_TRACKED_VGT_GSVS_RING_OFFSET_1,
-                                   shader->ctx_reg.gs.vgt_gsvs_ring_offset_1,
-                                   shader->ctx_reg.gs.vgt_gsvs_ring_offset_2,
-                                   shader->ctx_reg.gs.vgt_gsvs_ring_offset_3);
-
-       /* R_028AB0_VGT_GSVS_RING_ITEMSIZE */
-       radeon_opt_set_context_reg(sctx, R_028AB0_VGT_GSVS_RING_ITEMSIZE,
-                                  SI_TRACKED_VGT_GSVS_RING_ITEMSIZE,
-                                  shader->ctx_reg.gs.vgt_gsvs_ring_itemsize);
-
-       /* R_028B38_VGT_GS_MAX_VERT_OUT */
-       radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT,
-                                  SI_TRACKED_VGT_GS_MAX_VERT_OUT,
-                                  shader->ctx_reg.gs.vgt_gs_max_vert_out);
-
-       /* R_028B5C_VGT_GS_VERT_ITEMSIZE, R_028B60_VGT_GS_VERT_ITEMSIZE_1
-        * R_028B64_VGT_GS_VERT_ITEMSIZE_2, R_028B68_VGT_GS_VERT_ITEMSIZE_3 */
-       radeon_opt_set_context_reg4(sctx, R_028B5C_VGT_GS_VERT_ITEMSIZE,
-                                   SI_TRACKED_VGT_GS_VERT_ITEMSIZE,
-                                   shader->ctx_reg.gs.vgt_gs_vert_itemsize,
-                                   shader->ctx_reg.gs.vgt_gs_vert_itemsize_1,
-                                   shader->ctx_reg.gs.vgt_gs_vert_itemsize_2,
-                                   shader->ctx_reg.gs.vgt_gs_vert_itemsize_3);
-
-       /* R_028B90_VGT_GS_INSTANCE_CNT */
-       radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT,
-                                  SI_TRACKED_VGT_GS_INSTANCE_CNT,
-                                  shader->ctx_reg.gs.vgt_gs_instance_cnt);
-
-       if (sctx->chip_class >= GFX9) {
-               /* R_028A44_VGT_GS_ONCHIP_CNTL */
-               radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL,
-                                          SI_TRACKED_VGT_GS_ONCHIP_CNTL,
-                                          shader->ctx_reg.gs.vgt_gs_onchip_cntl);
-               /* R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP */
-               radeon_opt_set_context_reg(sctx, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
-                                          SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
-                                          shader->ctx_reg.gs.vgt_gs_max_prims_per_subgroup);
-               /* R_028AAC_VGT_ESGS_RING_ITEMSIZE */
-               radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
-                                          SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
-                                          shader->ctx_reg.gs.vgt_esgs_ring_itemsize);
-
-               if (shader->key.part.gs.es->type == PIPE_SHADER_TESS_EVAL)
-                       radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
-                                                  SI_TRACKED_VGT_TF_PARAM,
-                                                  shader->vgt_tf_param);
-               if (shader->vgt_vertex_reuse_block_cntl)
-                       radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
-                                                  SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
-                                                  shader->vgt_vertex_reuse_block_cntl);
-       }
-
-       if (initial_cdw != sctx->gfx_cs->current.cdw)
-               sctx->context_roll = true;
+   struct si_shader *shader = sctx->queued.named.gs->shader;
+   unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+
+   if (!shader)
+      return;
+
+   /* R_028A60_VGT_GSVS_RING_OFFSET_1, R_028A64_VGT_GSVS_RING_OFFSET_2
+    * R_028A68_VGT_GSVS_RING_OFFSET_3 */
+   radeon_opt_set_context_reg3(
+      sctx, R_028A60_VGT_GSVS_RING_OFFSET_1, SI_TRACKED_VGT_GSVS_RING_OFFSET_1,
+      shader->ctx_reg.gs.vgt_gsvs_ring_offset_1, shader->ctx_reg.gs.vgt_gsvs_ring_offset_2,
+      shader->ctx_reg.gs.vgt_gsvs_ring_offset_3);
+
+   /* R_028AB0_VGT_GSVS_RING_ITEMSIZE */
+   radeon_opt_set_context_reg(sctx, R_028AB0_VGT_GSVS_RING_ITEMSIZE,
+                              SI_TRACKED_VGT_GSVS_RING_ITEMSIZE,
+                              shader->ctx_reg.gs.vgt_gsvs_ring_itemsize);
+
+   /* R_028B38_VGT_GS_MAX_VERT_OUT */
+   radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT,
+                              shader->ctx_reg.gs.vgt_gs_max_vert_out);
+
+   /* R_028B5C_VGT_GS_VERT_ITEMSIZE, R_028B60_VGT_GS_VERT_ITEMSIZE_1
+    * R_028B64_VGT_GS_VERT_ITEMSIZE_2, R_028B68_VGT_GS_VERT_ITEMSIZE_3 */
+   radeon_opt_set_context_reg4(
+      sctx, R_028B5C_VGT_GS_VERT_ITEMSIZE, SI_TRACKED_VGT_GS_VERT_ITEMSIZE,
+      shader->ctx_reg.gs.vgt_gs_vert_itemsize, shader->ctx_reg.gs.vgt_gs_vert_itemsize_1,
+      shader->ctx_reg.gs.vgt_gs_vert_itemsize_2, shader->ctx_reg.gs.vgt_gs_vert_itemsize_3);
+
+   /* R_028B90_VGT_GS_INSTANCE_CNT */
+   radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, SI_TRACKED_VGT_GS_INSTANCE_CNT,
+                              shader->ctx_reg.gs.vgt_gs_instance_cnt);
+
+   if (sctx->chip_class >= GFX9) {
+      /* R_028A44_VGT_GS_ONCHIP_CNTL */
+      radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, SI_TRACKED_VGT_GS_ONCHIP_CNTL,
+                                 shader->ctx_reg.gs.vgt_gs_onchip_cntl);
+      /* R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP */
+      radeon_opt_set_context_reg(sctx, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
+                                 SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
+                                 shader->ctx_reg.gs.vgt_gs_max_prims_per_subgroup);
+      /* R_028AAC_VGT_ESGS_RING_ITEMSIZE */
+      radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
+                                 SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
+                                 shader->ctx_reg.gs.vgt_esgs_ring_itemsize);
+
+      if (shader->key.part.gs.es->type == PIPE_SHADER_TESS_EVAL)
+         radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
+                                    shader->vgt_tf_param);
+      if (shader->vgt_vertex_reuse_block_cntl)
+         radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
+                                    SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
+                                    shader->vgt_vertex_reuse_block_cntl);
+   }
+
+   if (initial_cdw != sctx->gfx_cs->current.cdw)
+      sctx->context_roll = true;
 }
 
 static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
 {
-       struct si_shader_selector *sel = shader->selector;
-       const ubyte *num_components = sel->info.num_stream_output_components;
-       unsigned gs_num_invocations = sel->gs_num_invocations;
-       struct si_pm4_state *pm4;
-       uint64_t va;
-       unsigned max_stream = sel->max_gs_stream;
-       unsigned offset;
-
-       pm4 = si_get_shader_pm4_state(shader);
-       if (!pm4)
-               return;
-
-       pm4->atom.emit = si_emit_shader_gs;
-
-       offset = num_components[0] * sel->gs_max_out_vertices;
-       shader->ctx_reg.gs.vgt_gsvs_ring_offset_1 = offset;
-
-       if (max_stream >= 1)
-               offset += num_components[1] * sel->gs_max_out_vertices;
-       shader->ctx_reg.gs.vgt_gsvs_ring_offset_2 = offset;
-
-       if (max_stream >= 2)
-               offset += num_components[2] * sel->gs_max_out_vertices;
-       shader->ctx_reg.gs.vgt_gsvs_ring_offset_3 = offset;
-
-       if (max_stream >= 3)
-               offset += num_components[3] * sel->gs_max_out_vertices;
-       shader->ctx_reg.gs.vgt_gsvs_ring_itemsize = offset;
-
-       /* The GSVS_RING_ITEMSIZE register takes 15 bits */
-       assert(offset < (1 << 15));
-
-       shader->ctx_reg.gs.vgt_gs_max_vert_out = sel->gs_max_out_vertices;
-
-       shader->ctx_reg.gs.vgt_gs_vert_itemsize = num_components[0];
-       shader->ctx_reg.gs.vgt_gs_vert_itemsize_1 = (max_stream >= 1) ? num_components[1] : 0;
-       shader->ctx_reg.gs.vgt_gs_vert_itemsize_2 = (max_stream >= 2) ? num_components[2] : 0;
-       shader->ctx_reg.gs.vgt_gs_vert_itemsize_3 = (max_stream >= 3) ? num_components[3] : 0;
-
-       shader->ctx_reg.gs.vgt_gs_instance_cnt = S_028B90_CNT(MIN2(gs_num_invocations, 127)) |
-                                                S_028B90_ENABLE(gs_num_invocations > 0);
-
-       va = shader->bo->gpu_address;
-       si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
-
-       if (sscreen->info.chip_class >= GFX9) {
-               unsigned input_prim = sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM];
-               unsigned es_type = shader->key.part.gs.es->type;
-               unsigned es_vgpr_comp_cnt, gs_vgpr_comp_cnt;
-
-               if (es_type == PIPE_SHADER_VERTEX) {
-                       es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false);
-               } else if (es_type == PIPE_SHADER_TESS_EVAL)
-                       es_vgpr_comp_cnt = shader->key.part.gs.es->info.uses_primid ? 3 : 2;
-               else
-                       unreachable("invalid shader selector type");
-
-               /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
-                * VGPR[0:4] are always loaded.
-                */
-               if (sel->info.uses_invocationid)
-                       gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */
-               else if (sel->info.uses_primid)
-                       gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
-               else if (input_prim >= PIPE_PRIM_TRIANGLES)
-                       gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
-               else
-                       gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
-
-               unsigned num_user_sgprs;
-               if (es_type == PIPE_SHADER_VERTEX)
-                       num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR);
-               else
-                       num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
-
-               if (sscreen->info.chip_class >= GFX10) {
-                       si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
-                       si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40));
-               } else {
-                       si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8);
-                       si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, S_00B214_MEM_BASE(va >> 40));
-               }
-
-               uint32_t rsrc1 =
-                       S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
-                       S_00B228_DX10_CLAMP(1) |
-                       S_00B228_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
-                       S_00B228_WGP_MODE(sscreen->info.chip_class >= GFX10) |
-                       S_00B228_FLOAT_MODE(shader->config.float_mode) |
-                       S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt);
-               uint32_t rsrc2 =
-                       S_00B22C_USER_SGPR(num_user_sgprs) |
-                       S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
-                       S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) |
-                       S_00B22C_LDS_SIZE(shader->config.lds_size) |
-                       S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
-
-               if (sscreen->info.chip_class >= GFX10) {
-                       rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5);
-               } else {
-                       rsrc1 |= S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8);
-                       rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
-               }
-
-               si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1);
-               si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
-
-               if (sscreen->info.chip_class >= GFX10) {
-                       si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
-                                      S_00B204_CU_EN(0xffff) |
-                                      S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0));
-               }
-
-               shader->ctx_reg.gs.vgt_gs_onchip_cntl =
-                       S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) |
-                       S_028A44_GS_PRIMS_PER_SUBGRP(shader->gs_info.gs_prims_per_subgroup) |
-                       S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->gs_info.gs_inst_prims_in_subgroup);
-               shader->ctx_reg.gs.vgt_gs_max_prims_per_subgroup =
-                       S_028A94_MAX_PRIMS_PER_SUBGROUP(shader->gs_info.max_prims_per_subgroup);
-               shader->ctx_reg.gs.vgt_esgs_ring_itemsize =
-                       shader->key.part.gs.es->esgs_itemsize / 4;
-
-               if (es_type == PIPE_SHADER_TESS_EVAL)
-                       si_set_tesseval_regs(sscreen, shader->key.part.gs.es, pm4);
-
-               polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es,
-                                            NULL, pm4);
-       } else {
-               si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
-               si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, S_00B224_MEM_BASE(va >> 40));
-
-               si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
-                              S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
-                              S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8) |
-                              S_00B228_DX10_CLAMP(1) |
-                              S_00B228_FLOAT_MODE(shader->config.float_mode));
-               si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
-                              S_00B22C_USER_SGPR(GFX6_GS_NUM_USER_SGPR) |
-                              S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
-       }
+   struct si_shader_selector *sel = shader->selector;
+   const ubyte *num_components = sel->info.num_stream_output_components;
+   unsigned gs_num_invocations = sel->gs_num_invocations;
+   struct si_pm4_state *pm4;
+   uint64_t va;
+   unsigned max_stream = sel->max_gs_stream;
+   unsigned offset;
+
+   pm4 = si_get_shader_pm4_state(shader);
+   if (!pm4)
+      return;
+
+   pm4->atom.emit = si_emit_shader_gs;
+
+   offset = num_components[0] * sel->gs_max_out_vertices;
+   shader->ctx_reg.gs.vgt_gsvs_ring_offset_1 = offset;
+
+   if (max_stream >= 1)
+      offset += num_components[1] * sel->gs_max_out_vertices;
+   shader->ctx_reg.gs.vgt_gsvs_ring_offset_2 = offset;
+
+   if (max_stream >= 2)
+      offset += num_components[2] * sel->gs_max_out_vertices;
+   shader->ctx_reg.gs.vgt_gsvs_ring_offset_3 = offset;
+
+   if (max_stream >= 3)
+      offset += num_components[3] * sel->gs_max_out_vertices;
+   shader->ctx_reg.gs.vgt_gsvs_ring_itemsize = offset;
+
+   /* The GSVS_RING_ITEMSIZE register takes 15 bits */
+   assert(offset < (1 << 15));
+
+   shader->ctx_reg.gs.vgt_gs_max_vert_out = sel->gs_max_out_vertices;
+
+   shader->ctx_reg.gs.vgt_gs_vert_itemsize = num_components[0];
+   shader->ctx_reg.gs.vgt_gs_vert_itemsize_1 = (max_stream >= 1) ? num_components[1] : 0;
+   shader->ctx_reg.gs.vgt_gs_vert_itemsize_2 = (max_stream >= 2) ? num_components[2] : 0;
+   shader->ctx_reg.gs.vgt_gs_vert_itemsize_3 = (max_stream >= 3) ? num_components[3] : 0;
+
+   shader->ctx_reg.gs.vgt_gs_instance_cnt =
+      S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0);
+
+   va = shader->bo->gpu_address;
+   si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
+
+   if (sscreen->info.chip_class >= GFX9) {
+      unsigned input_prim = sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM];
+      unsigned es_type = shader->key.part.gs.es->type;
+      unsigned es_vgpr_comp_cnt, gs_vgpr_comp_cnt;
+
+      if (es_type == PIPE_SHADER_VERTEX) {
+         es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false);
+      } else if (es_type == PIPE_SHADER_TESS_EVAL)
+         es_vgpr_comp_cnt = shader->key.part.gs.es->info.uses_primid ? 3 : 2;
+      else
+         unreachable("invalid shader selector type");
+
+      /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
+       * VGPR[0:4] are always loaded.
+       */
+      if (sel->info.uses_invocationid)
+         gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */
+      else if (sel->info.uses_primid)
+         gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
+      else if (input_prim >= PIPE_PRIM_TRIANGLES)
+         gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
+      else
+         gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
+
+      unsigned num_user_sgprs;
+      if (es_type == PIPE_SHADER_VERTEX)
+         num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR);
+      else
+         num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
+
+      if (sscreen->info.chip_class >= GFX10) {
+         si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
+         si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40));
+      } else {
+         si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8);
+         si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, S_00B214_MEM_BASE(va >> 40));
+      }
+
+      uint32_t rsrc1 = S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B228_DX10_CLAMP(1) |
+                       S_00B228_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
+                       S_00B228_WGP_MODE(sscreen->info.chip_class >= GFX10) |
+                       S_00B228_FLOAT_MODE(shader->config.float_mode) |
+                       S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt);
+      uint32_t rsrc2 = S_00B22C_USER_SGPR(num_user_sgprs) |
+                       S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
+                       S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) |
+                       S_00B22C_LDS_SIZE(shader->config.lds_size) |
+                       S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
+
+      if (sscreen->info.chip_class >= GFX10) {
+         rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5);
+      } else {
+         rsrc1 |= S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8);
+         rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
+      }
+
+      si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1);
+      si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
+
+      if (sscreen->info.chip_class >= GFX10) {
+         si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
+                        S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0));
+      }
+
+      shader->ctx_reg.gs.vgt_gs_onchip_cntl =
+         S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) |
+         S_028A44_GS_PRIMS_PER_SUBGRP(shader->gs_info.gs_prims_per_subgroup) |
+         S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->gs_info.gs_inst_prims_in_subgroup);
+      shader->ctx_reg.gs.vgt_gs_max_prims_per_subgroup =
+         S_028A94_MAX_PRIMS_PER_SUBGROUP(shader->gs_info.max_prims_per_subgroup);
+      shader->ctx_reg.gs.vgt_esgs_ring_itemsize = shader->key.part.gs.es->esgs_itemsize / 4;
+
+      if (es_type == PIPE_SHADER_TESS_EVAL)
+         si_set_tesseval_regs(sscreen, shader->key.part.gs.es, pm4);
+
+      polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, NULL, pm4);
+   } else {
+      si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
+      si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, S_00B224_MEM_BASE(va >> 40));
+
+      si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
+                     S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
+                        S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8) |
+                        S_00B228_DX10_CLAMP(1) | S_00B228_FLOAT_MODE(shader->config.float_mode));
+      si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
+                     S_00B22C_USER_SGPR(GFX6_GS_NUM_USER_SGPR) |
+                        S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
+   }
 }
 
 static void gfx10_emit_ge_pc_alloc(struct si_context *sctx, unsigned value)
 {
-       enum si_tracked_reg reg = SI_TRACKED_GE_PC_ALLOC;
+   enum si_tracked_reg reg = SI_TRACKED_GE_PC_ALLOC;
 
-       if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 ||
-           sctx->tracked_regs.reg_value[reg] != value) {
-               struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 ||
+       sctx->tracked_regs.reg_value[reg] != value) {
+      struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
-               if (sctx->family == CHIP_NAVI10 ||
-                   sctx->family == CHIP_NAVI12 ||
-                   sctx->family == CHIP_NAVI14) {
-                       /* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */
-                       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-                       radeon_emit(cs, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0));
-               }
+      if (sctx->family == CHIP_NAVI10 || sctx->family == CHIP_NAVI12 ||
+          sctx->family == CHIP_NAVI14) {
+         /* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */
+         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+         radeon_emit(cs, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0));
+      }
 
-               radeon_set_uconfig_reg(cs, R_030980_GE_PC_ALLOC, value);
+      radeon_set_uconfig_reg(cs, R_030980_GE_PC_ALLOC, value);
 
-               sctx->tracked_regs.reg_saved |= 0x1ull << reg;
-               sctx->tracked_regs.reg_value[reg] = value;
-       }
+      sctx->tracked_regs.reg_saved |= 0x1ull << reg;
+      sctx->tracked_regs.reg_value[reg] = value;
+   }
 }
 
 /* Common tail code for NGG primitive shaders. */
-static void gfx10_emit_shader_ngg_tail(struct si_context *sctx,
-                                      struct si_shader *shader,
-                                      unsigned initial_cdw)
-{
-       radeon_opt_set_context_reg(sctx, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
-                                  SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP,
-                                  shader->ctx_reg.ngg.ge_max_output_per_subgroup);
-       radeon_opt_set_context_reg(sctx, R_028B4C_GE_NGG_SUBGRP_CNTL,
-                                  SI_TRACKED_GE_NGG_SUBGRP_CNTL,
-                                  shader->ctx_reg.ngg.ge_ngg_subgrp_cntl);
-       radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN,
-                                  SI_TRACKED_VGT_PRIMITIVEID_EN,
-                                  shader->ctx_reg.ngg.vgt_primitiveid_en);
-       radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL,
-                                  SI_TRACKED_VGT_GS_ONCHIP_CNTL,
-                                  shader->ctx_reg.ngg.vgt_gs_onchip_cntl);
-       radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT,
-                                  SI_TRACKED_VGT_GS_INSTANCE_CNT,
-                                  shader->ctx_reg.ngg.vgt_gs_instance_cnt);
-       radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
-                                  SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
-                                  shader->ctx_reg.ngg.vgt_esgs_ring_itemsize);
-       radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG,
-                                  SI_TRACKED_SPI_VS_OUT_CONFIG,
-                                  shader->ctx_reg.ngg.spi_vs_out_config);
-       radeon_opt_set_context_reg2(sctx, R_028708_SPI_SHADER_IDX_FORMAT,
-                                  SI_TRACKED_SPI_SHADER_IDX_FORMAT,
-                                  shader->ctx_reg.ngg.spi_shader_idx_format,
-                                  shader->ctx_reg.ngg.spi_shader_pos_format);
-       radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL,
-                                  SI_TRACKED_PA_CL_VTE_CNTL,
-                                  shader->ctx_reg.ngg.pa_cl_vte_cntl);
-       radeon_opt_set_context_reg(sctx, R_028838_PA_CL_NGG_CNTL,
-                                  SI_TRACKED_PA_CL_NGG_CNTL,
-                                  shader->ctx_reg.ngg.pa_cl_ngg_cntl);
-
-       radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
-                                      SI_TRACKED_PA_CL_VS_OUT_CNTL__VS,
-                                      shader->pa_cl_vs_out_cntl,
-                                      SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
-
-       if (initial_cdw != sctx->gfx_cs->current.cdw)
-               sctx->context_roll = true;
-
-       /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */
-       gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.ngg.ge_pc_alloc);
+static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader *shader,
+                                       unsigned initial_cdw)
+{
+   radeon_opt_set_context_reg(sctx, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
+                              SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP,
+                              shader->ctx_reg.ngg.ge_max_output_per_subgroup);
+   radeon_opt_set_context_reg(sctx, R_028B4C_GE_NGG_SUBGRP_CNTL, SI_TRACKED_GE_NGG_SUBGRP_CNTL,
+                              shader->ctx_reg.ngg.ge_ngg_subgrp_cntl);
+   radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN, SI_TRACKED_VGT_PRIMITIVEID_EN,
+                              shader->ctx_reg.ngg.vgt_primitiveid_en);
+   radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, SI_TRACKED_VGT_GS_ONCHIP_CNTL,
+                              shader->ctx_reg.ngg.vgt_gs_onchip_cntl);
+   radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, SI_TRACKED_VGT_GS_INSTANCE_CNT,
+                              shader->ctx_reg.ngg.vgt_gs_instance_cnt);
+   radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
+                              SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
+                              shader->ctx_reg.ngg.vgt_esgs_ring_itemsize);
+   radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG, SI_TRACKED_SPI_VS_OUT_CONFIG,
+                              shader->ctx_reg.ngg.spi_vs_out_config);
+   radeon_opt_set_context_reg2(
+      sctx, R_028708_SPI_SHADER_IDX_FORMAT, SI_TRACKED_SPI_SHADER_IDX_FORMAT,
+      shader->ctx_reg.ngg.spi_shader_idx_format, shader->ctx_reg.ngg.spi_shader_pos_format);
+   radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL, SI_TRACKED_PA_CL_VTE_CNTL,
+                              shader->ctx_reg.ngg.pa_cl_vte_cntl);
+   radeon_opt_set_context_reg(sctx, R_028838_PA_CL_NGG_CNTL, SI_TRACKED_PA_CL_NGG_CNTL,
+                              shader->ctx_reg.ngg.pa_cl_ngg_cntl);
+
+   radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
+                                  SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, shader->pa_cl_vs_out_cntl,
+                                  SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
+
+   if (initial_cdw != sctx->gfx_cs->current.cdw)
+      sctx->context_roll = true;
+
+   /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */
+   gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.ngg.ge_pc_alloc);
 }
 
 static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx)
 {
-       struct si_shader *shader = sctx->queued.named.gs->shader;
-       unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+   struct si_shader *shader = sctx->queued.named.gs->shader;
+   unsigned initial_cdw = sctx->gfx_cs->current.cdw;
 
-       if (!shader)
-               return;
+   if (!shader)
+      return;
 
-       gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw);
+   gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw);
 }
 
 static void gfx10_emit_shader_ngg_tess_nogs(struct si_context *sctx)
 {
-       struct si_shader *shader = sctx->queued.named.gs->shader;
-       unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+   struct si_shader *shader = sctx->queued.named.gs->shader;
+   unsigned initial_cdw = sctx->gfx_cs->current.cdw;
 
-       if (!shader)
-               return;
+   if (!shader)
+      return;
 
-       radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
-                                  SI_TRACKED_VGT_TF_PARAM,
-                                  shader->vgt_tf_param);
+   radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
+                              shader->vgt_tf_param);
 
-       gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw);
+   gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw);
 }
 
 static void gfx10_emit_shader_ngg_notess_gs(struct si_context *sctx)
 {
-       struct si_shader *shader = sctx->queued.named.gs->shader;
-       unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+   struct si_shader *shader = sctx->queued.named.gs->shader;
+   unsigned initial_cdw = sctx->gfx_cs->current.cdw;
 
-       if (!shader)
-               return;
+   if (!shader)
+      return;
 
-       radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT,
-                                  SI_TRACKED_VGT_GS_MAX_VERT_OUT,
-                                  shader->ctx_reg.ngg.vgt_gs_max_vert_out);
+   radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT,
+                              shader->ctx_reg.ngg.vgt_gs_max_vert_out);
 
-       gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw);
+   gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw);
 }
 
 static void gfx10_emit_shader_ngg_tess_gs(struct si_context *sctx)
 {
-       struct si_shader *shader = sctx->queued.named.gs->shader;
-       unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+   struct si_shader *shader = sctx->queued.named.gs->shader;
+   unsigned initial_cdw = sctx->gfx_cs->current.cdw;
 
-       if (!shader)
-               return;
+   if (!shader)
+      return;
 
-       radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT,
-                                  SI_TRACKED_VGT_GS_MAX_VERT_OUT,
-                                  shader->ctx_reg.ngg.vgt_gs_max_vert_out);
-       radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
-                                  SI_TRACKED_VGT_TF_PARAM,
-                                  shader->vgt_tf_param);
+   radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT,
+                              shader->ctx_reg.ngg.vgt_gs_max_vert_out);
+   radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
+                              shader->vgt_tf_param);
 
-       gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw);
+   gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw);
 }
 
 unsigned si_get_input_prim(const struct si_shader_selector *gs)
 {
-       if (gs->type == PIPE_SHADER_GEOMETRY)
-               return gs->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM];
-
-       if (gs->type == PIPE_SHADER_TESS_EVAL) {
-               if (gs->info.properties[TGSI_PROPERTY_TES_POINT_MODE])
-                       return PIPE_PRIM_POINTS;
-               if (gs->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES)
-                       return PIPE_PRIM_LINES;
-               return PIPE_PRIM_TRIANGLES;
-       }
-
-       /* TODO: Set this correctly if the primitive type is set in the shader key. */
-       return PIPE_PRIM_TRIANGLES; /* worst case for all callers */
+   if (gs->type == PIPE_SHADER_GEOMETRY)
+      return gs->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM];
+
+   if (gs->type == PIPE_SHADER_TESS_EVAL) {
+      if (gs->info.properties[TGSI_PROPERTY_TES_POINT_MODE])
+         return PIPE_PRIM_POINTS;
+      if (gs->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES)
+         return PIPE_PRIM_LINES;
+      return PIPE_PRIM_TRIANGLES;
+   }
+
+   /* TODO: Set this correctly if the primitive type is set in the shader key. */
+   return PIPE_PRIM_TRIANGLES; /* worst case for all callers */
 }
 
 static unsigned si_get_vs_out_cntl(const struct si_shader_selector *sel, bool ngg)
 {
-       bool misc_vec_ena =
-               sel->info.writes_psize || (sel->info.writes_edgeflag && !ngg) ||
-               sel->info.writes_layer || sel->info.writes_viewport_index;
-       return S_02881C_USE_VTX_POINT_SIZE(sel->info.writes_psize) |
-              S_02881C_USE_VTX_EDGE_FLAG(sel->info.writes_edgeflag && !ngg) |
-              S_02881C_USE_VTX_RENDER_TARGET_INDX(sel->info.writes_layer) |
-              S_02881C_USE_VTX_VIEWPORT_INDX(sel->info.writes_viewport_index) |
-              S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
-              S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena);
+   bool misc_vec_ena = sel->info.writes_psize || (sel->info.writes_edgeflag && !ngg) ||
+                       sel->info.writes_layer || sel->info.writes_viewport_index;
+   return S_02881C_USE_VTX_POINT_SIZE(sel->info.writes_psize) |
+          S_02881C_USE_VTX_EDGE_FLAG(sel->info.writes_edgeflag && !ngg) |
+          S_02881C_USE_VTX_RENDER_TARGET_INDX(sel->info.writes_layer) |
+          S_02881C_USE_VTX_VIEWPORT_INDX(sel->info.writes_viewport_index) |
+          S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
+          S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena);
 }
 
 /**
@@ -1136,305 +1061,279 @@ static unsigned si_get_vs_out_cntl(const struct si_shader_selector *sel, bool ng
  */
 static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader)
 {
-       const struct si_shader_selector *gs_sel = shader->selector;
-       const struct si_shader_info *gs_info = &gs_sel->info;
-       enum pipe_shader_type gs_type = shader->selector->type;
-       const struct si_shader_selector *es_sel =
-               shader->previous_stage_sel ? shader->previous_stage_sel : shader->selector;
-       const struct si_shader_info *es_info = &es_sel->info;
-       enum pipe_shader_type es_type = es_sel->type;
-       unsigned num_user_sgprs;
-       unsigned nparams, es_vgpr_comp_cnt, gs_vgpr_comp_cnt;
-       uint64_t va;
-       unsigned window_space =
-               gs_info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
-       bool es_enable_prim_id = shader->key.mono.u.vs_export_prim_id || es_info->uses_primid;
-       unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1);
-       unsigned input_prim = si_get_input_prim(gs_sel);
-       bool break_wave_at_eoi = false;
-       struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader);
-       if (!pm4)
-               return;
-
-       if (es_type == PIPE_SHADER_TESS_EVAL) {
-               pm4->atom.emit = gs_type == PIPE_SHADER_GEOMETRY ? gfx10_emit_shader_ngg_tess_gs
-                                                                : gfx10_emit_shader_ngg_tess_nogs;
-       } else {
-               pm4->atom.emit = gs_type == PIPE_SHADER_GEOMETRY ? gfx10_emit_shader_ngg_notess_gs
-                                                                : gfx10_emit_shader_ngg_notess_nogs;
-       }
-
-       va = shader->bo->gpu_address;
-       si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
-
-       if (es_type == PIPE_SHADER_VERTEX) {
-               es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false);
-
-               if (es_info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
-                       num_user_sgprs = SI_SGPR_VS_BLIT_DATA +
-                                        es_info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
-               } else {
-                       num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR);
-               }
-       } else {
-               assert(es_type == PIPE_SHADER_TESS_EVAL);
-               es_vgpr_comp_cnt = es_enable_prim_id ? 3 : 2;
-               num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
-
-               if (es_enable_prim_id || gs_info->uses_primid)
-                       break_wave_at_eoi = true;
-       }
-
-       /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
-        * VGPR[0:4] are always loaded.
-        *
-        * Vertex shaders always need to load VGPR3, because they need to
-        * pass edge flags for decomposed primitives (such as quads) to the PA
-        * for the GL_LINE polygon mode to skip rendering lines on inner edges.
-        */
-       if (gs_info->uses_invocationid ||
-           (gs_type == PIPE_SHADER_VERTEX && !gfx10_is_ngg_passthrough(shader)))
-               gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID, edge flags. */
-       else if ((gs_type == PIPE_SHADER_GEOMETRY && gs_info->uses_primid) ||
-                (gs_type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id))
-               gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
-       else if (input_prim >= PIPE_PRIM_TRIANGLES && !gfx10_is_ngg_passthrough(shader))
-               gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
-       else
-               gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
-
-       si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
-       si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40);
-       si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
-                      S_00B228_VGPRS((shader->config.num_vgprs - 1) /
-                                     (sscreen->ge_wave_size == 32 ? 8 : 4)) |
-                      S_00B228_FLOAT_MODE(shader->config.float_mode) |
-                      S_00B228_DX10_CLAMP(1) |
-                      S_00B228_MEM_ORDERED(1) |
-                      S_00B228_WGP_MODE(1) |
-                      S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt));
-       si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
-                      S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0) |
-                      S_00B22C_USER_SGPR(num_user_sgprs) |
-                      S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
-                      S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5) |
-                      S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) |
-                      S_00B22C_LDS_SIZE(shader->config.lds_size));
-
-       /* Determine LATE_ALLOC_GS. */
-       unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh;
-       unsigned late_alloc_wave64; /* The limit is per SH. */
-
-       /* For Wave32, the hw will launch twice the number of late
-        * alloc waves, so 1 == 2x wave32.
-        *
-        * Don't use late alloc for NGG on Navi14 due to a hw bug.
-        */
-       if (sscreen->info.family == CHIP_NAVI14 || !sscreen->info.use_late_alloc)
-               late_alloc_wave64 = 0;
-       else if (num_cu_per_sh <= 6)
-               late_alloc_wave64 = num_cu_per_sh - 2; /* All CUs enabled */
-       else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
-               late_alloc_wave64 = (num_cu_per_sh - 2) * 6;
-       else
-               late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
-
-       /* Limit LATE_ALLOC_GS for prevent a hang (hw bug). */
-       if (sscreen->info.family == CHIP_NAVI10 ||
-           sscreen->info.family == CHIP_NAVI12 ||
-           sscreen->info.family == CHIP_NAVI14)
-               late_alloc_wave64 = MIN2(late_alloc_wave64, 64);
-
-       si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
-                      S_00B204_CU_EN(0xffff) |
-                      S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));
-
-       nparams = MAX2(shader->info.nr_param_exports, 1);
-       shader->ctx_reg.ngg.spi_vs_out_config =
-               S_0286C4_VS_EXPORT_COUNT(nparams - 1) |
-               S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0);
-
-       shader->ctx_reg.ngg.spi_shader_idx_format =
-               S_028708_IDX0_EXPORT_FORMAT(V_028708_SPI_SHADER_1COMP);
-       shader->ctx_reg.ngg.spi_shader_pos_format =
-               S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
-               S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ?
-                                           V_02870C_SPI_SHADER_4COMP :
-                                           V_02870C_SPI_SHADER_NONE) |
-               S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ?
-                                           V_02870C_SPI_SHADER_4COMP :
-                                           V_02870C_SPI_SHADER_NONE) |
-               S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ?
-                                           V_02870C_SPI_SHADER_4COMP :
-                                           V_02870C_SPI_SHADER_NONE);
-
-       shader->ctx_reg.ngg.vgt_primitiveid_en =
-               S_028A84_PRIMITIVEID_EN(es_enable_prim_id) |
-               S_028A84_NGG_DISABLE_PROVOK_REUSE(shader->key.mono.u.vs_export_prim_id ||
-                                                 gs_sel->info.writes_primid);
-
-       if (gs_type == PIPE_SHADER_GEOMETRY) {
-               shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = es_sel->esgs_itemsize / 4;
-               shader->ctx_reg.ngg.vgt_gs_max_vert_out = gs_sel->gs_max_out_vertices;
-       } else {
-               shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = 1;
-       }
-
-       if (es_type == PIPE_SHADER_TESS_EVAL)
-               si_set_tesseval_regs(sscreen, es_sel, pm4);
-
-       shader->ctx_reg.ngg.vgt_gs_onchip_cntl =
-               S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) |
-               S_028A44_GS_PRIMS_PER_SUBGRP(shader->ngg.max_gsprims) |
-               S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->ngg.max_gsprims * gs_num_invocations);
-       shader->ctx_reg.ngg.ge_max_output_per_subgroup =
-               S_0287FC_MAX_VERTS_PER_SUBGROUP(shader->ngg.max_out_verts);
-       shader->ctx_reg.ngg.ge_ngg_subgrp_cntl =
-               S_028B4C_PRIM_AMP_FACTOR(shader->ngg.prim_amp_factor) |
-               S_028B4C_THDS_PER_SUBGRP(0); /* for fast launch */
-       shader->ctx_reg.ngg.vgt_gs_instance_cnt =
-               S_028B90_CNT(gs_num_invocations) |
-               S_028B90_ENABLE(gs_num_invocations > 1) |
-               S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(
-                       shader->ngg.max_vert_out_per_gs_instance);
-
-       /* Always output hw-generated edge flags and pass them via the prim
-        * export to prevent drawing lines on internal edges of decomposed
-        * primitives (such as quads) with polygon mode = lines. Only VS needs
-        * this.
-        */
-       shader->ctx_reg.ngg.pa_cl_ngg_cntl =
-               S_028838_INDEX_BUF_EDGE_FLAG_ENA(gs_type == PIPE_SHADER_VERTEX);
-       shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(gs_sel, true);
-
-       /* Oversubscribe PC. This improves performance when there are too many varyings. */
-       float oversub_pc_factor = 0.25;
-
-       if (shader->key.opt.ngg_culling) {
-               /* Be more aggressive with NGG culling. */
-               if (shader->info.nr_param_exports > 4)
-                       oversub_pc_factor = 1;
-               else if (shader->info.nr_param_exports > 2)
-                       oversub_pc_factor = 0.75;
-               else
-                       oversub_pc_factor = 0.5;
-       }
-
-       unsigned oversub_pc_lines = sscreen->info.pc_lines * oversub_pc_factor;
-       shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) |
-                                         S_030980_NUM_PC_LINES(oversub_pc_lines - 1);
-
-       if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
-               shader->ge_cntl =
-                       S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
-                       S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims * 3);
-       } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
-               shader->ge_cntl =
-                       S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
-                       S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims + 2);
-       } else {
-               shader->ge_cntl =
-                       S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
-                       S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */
-                       S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
-
-               /* Bug workaround for a possible hang with non-tessellation cases.
-                * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
-                *
-                * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
-                */
-               if ((sscreen->info.family == CHIP_NAVI10 ||
-                    sscreen->info.family == CHIP_NAVI12 ||
-                    sscreen->info.family == CHIP_NAVI14) &&
-                   (es_type == PIPE_SHADER_VERTEX || gs_type == PIPE_SHADER_VERTEX) && /* = no tess */
-                   shader->ngg.hw_max_esverts != 256) {
-                       shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
-
-                       if (shader->ngg.hw_max_esverts > 5) {
-                               shader->ge_cntl |=
-                                       S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
-                       }
-               }
-       }
-
-       if (window_space) {
-               shader->ctx_reg.ngg.pa_cl_vte_cntl =
-                       S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1);
-       } else {
-               shader->ctx_reg.ngg.pa_cl_vte_cntl =
-                       S_028818_VTX_W0_FMT(1) |
-                       S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) |
-                       S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
-                       S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1);
-       }
+   const struct si_shader_selector *gs_sel = shader->selector;
+   const struct si_shader_info *gs_info = &gs_sel->info;
+   enum pipe_shader_type gs_type = shader->selector->type;
+   const struct si_shader_selector *es_sel =
+      shader->previous_stage_sel ? shader->previous_stage_sel : shader->selector;
+   const struct si_shader_info *es_info = &es_sel->info;
+   enum pipe_shader_type es_type = es_sel->type;
+   unsigned num_user_sgprs;
+   unsigned nparams, es_vgpr_comp_cnt, gs_vgpr_comp_cnt;
+   uint64_t va;
+   unsigned window_space = gs_info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
+   bool es_enable_prim_id = shader->key.mono.u.vs_export_prim_id || es_info->uses_primid;
+   unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1);
+   unsigned input_prim = si_get_input_prim(gs_sel);
+   bool break_wave_at_eoi = false;
+   struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader);
+   if (!pm4)
+      return;
+
+   if (es_type == PIPE_SHADER_TESS_EVAL) {
+      pm4->atom.emit = gs_type == PIPE_SHADER_GEOMETRY ? gfx10_emit_shader_ngg_tess_gs
+                                                       : gfx10_emit_shader_ngg_tess_nogs;
+   } else {
+      pm4->atom.emit = gs_type == PIPE_SHADER_GEOMETRY ? gfx10_emit_shader_ngg_notess_gs
+                                                       : gfx10_emit_shader_ngg_notess_nogs;
+   }
+
+   va = shader->bo->gpu_address;
+   si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
+
+   if (es_type == PIPE_SHADER_VERTEX) {
+      es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false);
+
+      if (es_info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
+         num_user_sgprs =
+            SI_SGPR_VS_BLIT_DATA + es_info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
+      } else {
+         num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR);
+      }
+   } else {
+      assert(es_type == PIPE_SHADER_TESS_EVAL);
+      es_vgpr_comp_cnt = es_enable_prim_id ? 3 : 2;
+      num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
+
+      if (es_enable_prim_id || gs_info->uses_primid)
+         break_wave_at_eoi = true;
+   }
+
+   /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
+    * VGPR[0:4] are always loaded.
+    *
+    * Vertex shaders always need to load VGPR3, because they need to
+    * pass edge flags for decomposed primitives (such as quads) to the PA
+    * for the GL_LINE polygon mode to skip rendering lines on inner edges.
+    */
+   if (gs_info->uses_invocationid ||
+       (gs_type == PIPE_SHADER_VERTEX && !gfx10_is_ngg_passthrough(shader)))
+      gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID, edge flags. */
+   else if ((gs_type == PIPE_SHADER_GEOMETRY && gs_info->uses_primid) ||
+            (gs_type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id))
+      gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
+   else if (input_prim >= PIPE_PRIM_TRIANGLES && !gfx10_is_ngg_passthrough(shader))
+      gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
+   else
+      gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
+
+   si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
+   si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40);
+   si_pm4_set_reg(
+      pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
+      S_00B228_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) |
+         S_00B228_FLOAT_MODE(shader->config.float_mode) | S_00B228_DX10_CLAMP(1) |
+         S_00B228_MEM_ORDERED(1) | S_00B228_WGP_MODE(1) |
+         S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt));
+   si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
+                  S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0) |
+                     S_00B22C_USER_SGPR(num_user_sgprs) |
+                     S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
+                     S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5) |
+                     S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) |
+                     S_00B22C_LDS_SIZE(shader->config.lds_size));
+
+   /* Determine LATE_ALLOC_GS. */
+   unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh;
+   unsigned late_alloc_wave64; /* The limit is per SH. */
+
+   /* For Wave32, the hw will launch twice the number of late
+    * alloc waves, so 1 == 2x wave32.
+    *
+    * Don't use late alloc for NGG on Navi14 due to a hw bug.
+    */
+   if (sscreen->info.family == CHIP_NAVI14 || !sscreen->info.use_late_alloc)
+      late_alloc_wave64 = 0;
+   else if (num_cu_per_sh <= 6)
+      late_alloc_wave64 = num_cu_per_sh - 2; /* All CUs enabled */
+   else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
+      late_alloc_wave64 = (num_cu_per_sh - 2) * 6;
+   else
+      late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
+
+   /* Limit LATE_ALLOC_GS to prevent a hang (hw bug). */
+   if (sscreen->info.family == CHIP_NAVI10 || sscreen->info.family == CHIP_NAVI12 ||
+       sscreen->info.family == CHIP_NAVI14)
+      late_alloc_wave64 = MIN2(late_alloc_wave64, 64);
+
+   si_pm4_set_reg(
+      pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
+      S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));
+
+   nparams = MAX2(shader->info.nr_param_exports, 1);
+   shader->ctx_reg.ngg.spi_vs_out_config =
+      S_0286C4_VS_EXPORT_COUNT(nparams - 1) |
+      S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0);
+
+   shader->ctx_reg.ngg.spi_shader_idx_format =
+      S_028708_IDX0_EXPORT_FORMAT(V_028708_SPI_SHADER_1COMP);
+   shader->ctx_reg.ngg.spi_shader_pos_format =
+      S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
+      S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP
+                                                                  : V_02870C_SPI_SHADER_NONE) |
+      S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP
+                                                                  : V_02870C_SPI_SHADER_NONE) |
+      S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP
+                                                                  : V_02870C_SPI_SHADER_NONE);
+
+   shader->ctx_reg.ngg.vgt_primitiveid_en =
+      S_028A84_PRIMITIVEID_EN(es_enable_prim_id) |
+      S_028A84_NGG_DISABLE_PROVOK_REUSE(shader->key.mono.u.vs_export_prim_id ||
+                                        gs_sel->info.writes_primid);
+
+   if (gs_type == PIPE_SHADER_GEOMETRY) {
+      shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = es_sel->esgs_itemsize / 4;
+      shader->ctx_reg.ngg.vgt_gs_max_vert_out = gs_sel->gs_max_out_vertices;
+   } else {
+      shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = 1;
+   }
+
+   if (es_type == PIPE_SHADER_TESS_EVAL)
+      si_set_tesseval_regs(sscreen, es_sel, pm4);
+
+   shader->ctx_reg.ngg.vgt_gs_onchip_cntl =
+      S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) |
+      S_028A44_GS_PRIMS_PER_SUBGRP(shader->ngg.max_gsprims) |
+      S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->ngg.max_gsprims * gs_num_invocations);
+   shader->ctx_reg.ngg.ge_max_output_per_subgroup =
+      S_0287FC_MAX_VERTS_PER_SUBGROUP(shader->ngg.max_out_verts);
+   shader->ctx_reg.ngg.ge_ngg_subgrp_cntl = S_028B4C_PRIM_AMP_FACTOR(shader->ngg.prim_amp_factor) |
+                                            S_028B4C_THDS_PER_SUBGRP(0); /* for fast launch */
+   shader->ctx_reg.ngg.vgt_gs_instance_cnt =
+      S_028B90_CNT(gs_num_invocations) | S_028B90_ENABLE(gs_num_invocations > 1) |
+      S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(shader->ngg.max_vert_out_per_gs_instance);
+
+   /* Always output hw-generated edge flags and pass them via the prim
+    * export to prevent drawing lines on internal edges of decomposed
+    * primitives (such as quads) with polygon mode = lines. Only VS needs
+    * this.
+    */
+   shader->ctx_reg.ngg.pa_cl_ngg_cntl =
+      S_028838_INDEX_BUF_EDGE_FLAG_ENA(gs_type == PIPE_SHADER_VERTEX);
+   shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(gs_sel, true);
+
+   /* Oversubscribe PC. This improves performance when there are too many varyings. */
+   float oversub_pc_factor = 0.25;
+
+   if (shader->key.opt.ngg_culling) {
+      /* Be more aggressive with NGG culling. */
+      if (shader->info.nr_param_exports > 4)
+         oversub_pc_factor = 1;
+      else if (shader->info.nr_param_exports > 2)
+         oversub_pc_factor = 0.75;
+      else
+         oversub_pc_factor = 0.5;
+   }
+
+   unsigned oversub_pc_lines = sscreen->info.pc_lines * oversub_pc_factor;
+   shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) |
+                                     S_030980_NUM_PC_LINES(oversub_pc_lines - 1);
+
+   if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
+      shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
+                        S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims * 3);
+   } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
+      shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
+                        S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims + 2);
+   } else {
+      shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
+                        S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */
+                        S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
+
+      /* Bug workaround for a possible hang with non-tessellation cases.
+       * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
+       *
+       * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
+       */
+      if ((sscreen->info.family == CHIP_NAVI10 || sscreen->info.family == CHIP_NAVI12 ||
+           sscreen->info.family == CHIP_NAVI14) &&
+          (es_type == PIPE_SHADER_VERTEX || gs_type == PIPE_SHADER_VERTEX) && /* = no tess */
+          shader->ngg.hw_max_esverts != 256) {
+         shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
+
+         if (shader->ngg.hw_max_esverts > 5) {
+            shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
+         }
+      }
+   }
+
+   if (window_space) {
+      shader->ctx_reg.ngg.pa_cl_vte_cntl = S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1);
+   } else {
+      shader->ctx_reg.ngg.pa_cl_vte_cntl =
+         S_028818_VTX_W0_FMT(1) | S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) |
+         S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
+         S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1);
+   }
 }
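The LATE_ALLOC_GS limit chosen in the hunk above is plain integer arithmetic over the per-SH CU count. The following standalone sketch (not driver code: the helper name, the boolean parameters and the example CU count are illustrative stand-ins for the sscreen->info fields and the shader key) reproduces that selection and the Navi1x cap:

#include <stdio.h>

/* Mirrors the LATE_ALLOC_GS selection in gfx10_shader_ngg above. */
static unsigned late_alloc_gs_limit(unsigned num_cu_per_sh, int is_navi14, int use_late_alloc,
                                    int cull_gs_fast_launch_all, int is_navi1x)
{
   unsigned late_alloc_wave64;

   if (is_navi14 || !use_late_alloc)
      late_alloc_wave64 = 0; /* late alloc disabled (hw bug on Navi14) */
   else if (num_cu_per_sh <= 6)
      late_alloc_wave64 = num_cu_per_sh - 2; /* all CUs enabled */
   else if (cull_gs_fast_launch_all)
      late_alloc_wave64 = (num_cu_per_sh - 2) * 6;
   else
      late_alloc_wave64 = (num_cu_per_sh - 2) * 4;

   if (is_navi1x && late_alloc_wave64 > 64)
      late_alloc_wave64 = 64; /* hang workaround: cap the per-SH limit at 64 waves */

   return late_alloc_wave64;
}

int main(void)
{
   /* e.g. 10 CUs per SH, late alloc enabled, no fast-launch culling, Navi10-class part:
    * (10 - 2) * 4 = 32, which stays below the 64-wave cap. */
   printf("late_alloc_wave64 = %u\n", late_alloc_gs_limit(10, 0, 1, 0, 1));
   return 0;
}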
 
 static void si_emit_shader_vs(struct si_context *sctx)
 {
-       struct si_shader *shader = sctx->queued.named.vs->shader;
-       unsigned initial_cdw = sctx->gfx_cs->current.cdw;
-
-       if (!shader)
-               return;
-
-       radeon_opt_set_context_reg(sctx, R_028A40_VGT_GS_MODE,
-                                  SI_TRACKED_VGT_GS_MODE,
-                                  shader->ctx_reg.vs.vgt_gs_mode);
-       radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN,
-                                  SI_TRACKED_VGT_PRIMITIVEID_EN,
-                                  shader->ctx_reg.vs.vgt_primitiveid_en);
-
-       if (sctx->chip_class <= GFX8) {
-               radeon_opt_set_context_reg(sctx, R_028AB4_VGT_REUSE_OFF,
-                                          SI_TRACKED_VGT_REUSE_OFF,
-                                          shader->ctx_reg.vs.vgt_reuse_off);
-       }
-
-       radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG,
-                                  SI_TRACKED_SPI_VS_OUT_CONFIG,
-                                  shader->ctx_reg.vs.spi_vs_out_config);
-
-       radeon_opt_set_context_reg(sctx, R_02870C_SPI_SHADER_POS_FORMAT,
-                                  SI_TRACKED_SPI_SHADER_POS_FORMAT,
-                                  shader->ctx_reg.vs.spi_shader_pos_format);
-
-       radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL,
-                                  SI_TRACKED_PA_CL_VTE_CNTL,
-                                  shader->ctx_reg.vs.pa_cl_vte_cntl);
-
-       if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
-               radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
-                                          SI_TRACKED_VGT_TF_PARAM,
-                                          shader->vgt_tf_param);
-
-       if (shader->vgt_vertex_reuse_block_cntl)
-               radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
-                                          SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
-                                          shader->vgt_vertex_reuse_block_cntl);
-
-       /* Required programming for tessellation. (legacy pipeline only) */
-       if (sctx->chip_class == GFX10 &&
-           shader->selector->type == PIPE_SHADER_TESS_EVAL) {
-               radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL,
-                                          SI_TRACKED_VGT_GS_ONCHIP_CNTL,
-                                          S_028A44_ES_VERTS_PER_SUBGRP(250) |
-                                          S_028A44_GS_PRIMS_PER_SUBGRP(126) |
-                                          S_028A44_GS_INST_PRIMS_IN_SUBGRP(126));
-       }
-
-       if (sctx->chip_class >= GFX10) {
-               radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
-                                              SI_TRACKED_PA_CL_VS_OUT_CNTL__VS,
-                                              shader->pa_cl_vs_out_cntl,
-                                              SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
-       }
-
-       if (initial_cdw != sctx->gfx_cs->current.cdw)
-               sctx->context_roll = true;
-
-       /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */
-       if (sctx->chip_class >= GFX10)
-               gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.vs.ge_pc_alloc);
+   struct si_shader *shader = sctx->queued.named.vs->shader;
+   unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+
+   if (!shader)
+      return;
+
+   radeon_opt_set_context_reg(sctx, R_028A40_VGT_GS_MODE, SI_TRACKED_VGT_GS_MODE,
+                              shader->ctx_reg.vs.vgt_gs_mode);
+   radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN, SI_TRACKED_VGT_PRIMITIVEID_EN,
+                              shader->ctx_reg.vs.vgt_primitiveid_en);
+
+   if (sctx->chip_class <= GFX8) {
+      radeon_opt_set_context_reg(sctx, R_028AB4_VGT_REUSE_OFF, SI_TRACKED_VGT_REUSE_OFF,
+                                 shader->ctx_reg.vs.vgt_reuse_off);
+   }
+
+   radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG, SI_TRACKED_SPI_VS_OUT_CONFIG,
+                              shader->ctx_reg.vs.spi_vs_out_config);
+
+   radeon_opt_set_context_reg(sctx, R_02870C_SPI_SHADER_POS_FORMAT,
+                              SI_TRACKED_SPI_SHADER_POS_FORMAT,
+                              shader->ctx_reg.vs.spi_shader_pos_format);
+
+   radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL, SI_TRACKED_PA_CL_VTE_CNTL,
+                              shader->ctx_reg.vs.pa_cl_vte_cntl);
+
+   if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
+      radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
+                                 shader->vgt_tf_param);
+
+   if (shader->vgt_vertex_reuse_block_cntl)
+      radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
+                                 SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
+                                 shader->vgt_vertex_reuse_block_cntl);
+
+   /* Required programming for tessellation. (legacy pipeline only) */
+   if (sctx->chip_class == GFX10 && shader->selector->type == PIPE_SHADER_TESS_EVAL) {
+      radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, SI_TRACKED_VGT_GS_ONCHIP_CNTL,
+                                 S_028A44_ES_VERTS_PER_SUBGRP(250) |
+                                    S_028A44_GS_PRIMS_PER_SUBGRP(126) |
+                                    S_028A44_GS_INST_PRIMS_IN_SUBGRP(126));
+   }
+
+   if (sctx->chip_class >= GFX10) {
+      radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
+                                     SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, shader->pa_cl_vs_out_cntl,
+                                     SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
+   }
+
+   if (initial_cdw != sctx->gfx_cs->current.cdw)
+      sctx->context_roll = true;
+
+   /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */
+   if (sctx->chip_class >= GFX10)
+      gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.vs.ge_pc_alloc);
 }
 
 /**
@@ -1447,827 +1346,757 @@ static void si_emit_shader_vs(struct si_context *sctx)
 static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
                          struct si_shader_selector *gs)
 {
-       const struct si_shader_info *info = &shader->selector->info;
-       struct si_pm4_state *pm4;
-       unsigned num_user_sgprs, vgpr_comp_cnt;
-       uint64_t va;
-       unsigned nparams, oc_lds_en;
-       unsigned window_space =
-               info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
-       bool enable_prim_id = shader->key.mono.u.vs_export_prim_id || info->uses_primid;
-
-       pm4 = si_get_shader_pm4_state(shader);
-       if (!pm4)
-               return;
-
-       pm4->atom.emit = si_emit_shader_vs;
-
-       /* We always write VGT_GS_MODE in the VS state, because every switch
-        * between different shader pipelines involving a different GS or no
-        * GS at all involves a switch of the VS (different GS use different
-        * copy shaders). On the other hand, when the API switches from a GS to
-        * no GS and then back to the same GS used originally, the GS state is
-        * not sent again.
-        */
-       if (!gs) {
-               unsigned mode = V_028A40_GS_OFF;
-
-               /* PrimID needs GS scenario A. */
-               if (enable_prim_id)
-                       mode = V_028A40_GS_SCENARIO_A;
-
-               shader->ctx_reg.vs.vgt_gs_mode = S_028A40_MODE(mode);
-               shader->ctx_reg.vs.vgt_primitiveid_en = enable_prim_id;
-       } else {
-               shader->ctx_reg.vs.vgt_gs_mode = ac_vgt_gs_mode(gs->gs_max_out_vertices,
-                                                               sscreen->info.chip_class);
-               shader->ctx_reg.vs.vgt_primitiveid_en = 0;
-       }
-
-       if (sscreen->info.chip_class <= GFX8) {
-               /* Reuse needs to be set off if we write oViewport. */
-               shader->ctx_reg.vs.vgt_reuse_off =
-                               S_028AB4_REUSE_OFF(info->writes_viewport_index);
-       }
-
-       va = shader->bo->gpu_address;
-       si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
-
-       if (gs) {
-               vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY. */
-               num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR;
-       } else if (shader->selector->type == PIPE_SHADER_VERTEX) {
-               vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, enable_prim_id);
-
-               if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
-                       num_user_sgprs = SI_SGPR_VS_BLIT_DATA +
-                                        info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
-               } else {
-                       num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR);
-               }
-       } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
-               vgpr_comp_cnt = enable_prim_id ? 3 : 2;
-               num_user_sgprs = SI_TES_NUM_USER_SGPR;
-       } else
-               unreachable("invalid shader selector type");
-
-       /* VS is required to export at least one param. */
-       nparams = MAX2(shader->info.nr_param_exports, 1);
-       shader->ctx_reg.vs.spi_vs_out_config = S_0286C4_VS_EXPORT_COUNT(nparams - 1);
-
-       if (sscreen->info.chip_class >= GFX10) {
-               shader->ctx_reg.vs.spi_vs_out_config |=
-                       S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0);
-       }
-
-       shader->ctx_reg.vs.spi_shader_pos_format =
-                       S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
-                       S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ?
-                                                   V_02870C_SPI_SHADER_4COMP :
-                                                   V_02870C_SPI_SHADER_NONE) |
-                       S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ?
-                                                   V_02870C_SPI_SHADER_4COMP :
-                                                   V_02870C_SPI_SHADER_NONE) |
-                       S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ?
-                                                   V_02870C_SPI_SHADER_4COMP :
-                                                   V_02870C_SPI_SHADER_NONE);
-       shader->ctx_reg.vs.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) |
-                                        S_030980_NUM_PC_LINES(sscreen->info.pc_lines / 4 - 1);
-       shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, false);
-
-       oc_lds_en = shader->selector->type == PIPE_SHADER_TESS_EVAL ? 1 : 0;
-
-       si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8);
-       si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, S_00B124_MEM_BASE(va >> 40));
-
-       uint32_t rsrc1 = S_00B128_VGPRS((shader->config.num_vgprs - 1) /
-                                       (sscreen->ge_wave_size == 32 ? 8 : 4)) |
-                        S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) |
-                        S_00B128_DX10_CLAMP(1) |
-                        S_00B128_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
-                        S_00B128_FLOAT_MODE(shader->config.float_mode);
-       uint32_t rsrc2 = S_00B12C_USER_SGPR(num_user_sgprs) |
-                        S_00B12C_OC_LDS_EN(oc_lds_en) |
-                        S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
-
-       if (sscreen->info.chip_class >= GFX10)
-               rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5);
-       else if (sscreen->info.chip_class == GFX9)
-               rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
-
-       if (sscreen->info.chip_class <= GFX9)
-               rsrc1 |= S_00B128_SGPRS((shader->config.num_sgprs - 1) / 8);
-
-       if (!sscreen->use_ngg_streamout) {
-               rsrc2 |= S_00B12C_SO_BASE0_EN(!!shader->selector->so.stride[0]) |
-                        S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) |
-                        S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) |
-                        S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) |
-                        S_00B12C_SO_EN(!!shader->selector->so.num_outputs);
-       }
-
-       si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS, rsrc1);
-       si_pm4_set_reg(pm4, R_00B12C_SPI_SHADER_PGM_RSRC2_VS, rsrc2);
-
-       if (window_space)
-               shader->ctx_reg.vs.pa_cl_vte_cntl =
-                               S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1);
-       else
-               shader->ctx_reg.vs.pa_cl_vte_cntl =
-                               S_028818_VTX_W0_FMT(1) |
-                               S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) |
-                               S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
-                               S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1);
-
-       if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
-               si_set_tesseval_regs(sscreen, shader->selector, pm4);
-
-       polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4);
+   const struct si_shader_info *info = &shader->selector->info;
+   struct si_pm4_state *pm4;
+   unsigned num_user_sgprs, vgpr_comp_cnt;
+   uint64_t va;
+   unsigned nparams, oc_lds_en;
+   unsigned window_space = info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
+   bool enable_prim_id = shader->key.mono.u.vs_export_prim_id || info->uses_primid;
+
+   pm4 = si_get_shader_pm4_state(shader);
+   if (!pm4)
+      return;
+
+   pm4->atom.emit = si_emit_shader_vs;
+
+   /* We always write VGT_GS_MODE in the VS state, because every switch
+    * between different shader pipelines involving a different GS or no
+    * GS at all involves a switch of the VS (different GS use different
+    * copy shaders). On the other hand, when the API switches from a GS to
+    * no GS and then back to the same GS used originally, the GS state is
+    * not sent again.
+    */
+   if (!gs) {
+      unsigned mode = V_028A40_GS_OFF;
+
+      /* PrimID needs GS scenario A. */
+      if (enable_prim_id)
+         mode = V_028A40_GS_SCENARIO_A;
+
+      shader->ctx_reg.vs.vgt_gs_mode = S_028A40_MODE(mode);
+      shader->ctx_reg.vs.vgt_primitiveid_en = enable_prim_id;
+   } else {
+      shader->ctx_reg.vs.vgt_gs_mode =
+         ac_vgt_gs_mode(gs->gs_max_out_vertices, sscreen->info.chip_class);
+      shader->ctx_reg.vs.vgt_primitiveid_en = 0;
+   }
+
+   if (sscreen->info.chip_class <= GFX8) {
+      /* Reuse needs to be set off if we write oViewport. */
+      shader->ctx_reg.vs.vgt_reuse_off = S_028AB4_REUSE_OFF(info->writes_viewport_index);
+   }
+
+   va = shader->bo->gpu_address;
+   si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
+
+   if (gs) {
+      vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY. */
+      num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR;
+   } else if (shader->selector->type == PIPE_SHADER_VERTEX) {
+      vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, enable_prim_id);
+
+      if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
+         num_user_sgprs = SI_SGPR_VS_BLIT_DATA + info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
+      } else {
+         num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR);
+      }
+   } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
+      vgpr_comp_cnt = enable_prim_id ? 3 : 2;
+      num_user_sgprs = SI_TES_NUM_USER_SGPR;
+   } else
+      unreachable("invalid shader selector type");
+
+   /* VS is required to export at least one param. */
+   nparams = MAX2(shader->info.nr_param_exports, 1);
+   shader->ctx_reg.vs.spi_vs_out_config = S_0286C4_VS_EXPORT_COUNT(nparams - 1);
+
+   if (sscreen->info.chip_class >= GFX10) {
+      shader->ctx_reg.vs.spi_vs_out_config |=
+         S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0);
+   }
+
+   shader->ctx_reg.vs.spi_shader_pos_format =
+      S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
+      S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP
+                                                                  : V_02870C_SPI_SHADER_NONE) |
+      S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP
+                                                                  : V_02870C_SPI_SHADER_NONE) |
+      S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP
+                                                                  : V_02870C_SPI_SHADER_NONE);
+   shader->ctx_reg.vs.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) |
+                                    S_030980_NUM_PC_LINES(sscreen->info.pc_lines / 4 - 1);
+   shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, false);
+
+   oc_lds_en = shader->selector->type == PIPE_SHADER_TESS_EVAL ? 1 : 0;
+
+   si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8);
+   si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, S_00B124_MEM_BASE(va >> 40));
+
+   uint32_t rsrc1 =
+      S_00B128_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) |
+      S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) | S_00B128_DX10_CLAMP(1) |
+      S_00B128_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
+      S_00B128_FLOAT_MODE(shader->config.float_mode);
+   uint32_t rsrc2 = S_00B12C_USER_SGPR(num_user_sgprs) | S_00B12C_OC_LDS_EN(oc_lds_en) |
+                    S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
+
+   if (sscreen->info.chip_class >= GFX10)
+      rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5);
+   else if (sscreen->info.chip_class == GFX9)
+      rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
+
+   if (sscreen->info.chip_class <= GFX9)
+      rsrc1 |= S_00B128_SGPRS((shader->config.num_sgprs - 1) / 8);
+
+   if (!sscreen->use_ngg_streamout) {
+      rsrc2 |= S_00B12C_SO_BASE0_EN(!!shader->selector->so.stride[0]) |
+               S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) |
+               S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) |
+               S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) |
+               S_00B12C_SO_EN(!!shader->selector->so.num_outputs);
+   }
+
+   si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS, rsrc1);
+   si_pm4_set_reg(pm4, R_00B12C_SPI_SHADER_PGM_RSRC2_VS, rsrc2);
+
+   if (window_space)
+      shader->ctx_reg.vs.pa_cl_vte_cntl = S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1);
+   else
+      shader->ctx_reg.vs.pa_cl_vte_cntl =
+         S_028818_VTX_W0_FMT(1) | S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) |
+         S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
+         S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1);
+
+   if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
+      si_set_tesseval_regs(sscreen, shader->selector, pm4);
+
+   polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4);
 }
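The VGPRS field packed into rsrc1 above is just (num_vgprs - 1) divided by the allocation granule the expression uses (8 for wave32, 4 otherwise). A minimal sketch with made-up register counts (the function name and the example values are hypothetical, not taken from a real shader):

#include <stdio.h>

/* Mirrors the VGPRS expression used for SPI_SHADER_PGM_RSRC1_VS in si_shader_vs above. */
static unsigned encode_vgprs(unsigned num_vgprs, unsigned wave_size)
{
   return (num_vgprs - 1) / (wave_size == 32 ? 8 : 4);
}

int main(void)
{
   printf("wave64: %u\n", encode_vgprs(36, 64)); /* (36 - 1) / 4 = 8 */
   printf("wave32: %u\n", encode_vgprs(36, 32)); /* (36 - 1) / 8 = 4 */
   return 0;
}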
 
 static unsigned si_get_ps_num_interp(struct si_shader *ps)
 {
-       struct si_shader_info *info = &ps->selector->info;
-       unsigned num_colors = !!(info->colors_read & 0x0f) +
-                             !!(info->colors_read & 0xf0);
-       unsigned num_interp = ps->selector->info.num_inputs +
-                             (ps->key.part.ps.prolog.color_two_side ? num_colors : 0);
-
-       assert(num_interp <= 32);
-       return MIN2(num_interp, 32);
+   struct si_shader_info *info = &ps->selector->info;
+   unsigned num_colors = !!(info->colors_read & 0x0f) + !!(info->colors_read & 0xf0);
+   unsigned num_interp =
+      ps->selector->info.num_inputs + (ps->key.part.ps.prolog.color_two_side ? num_colors : 0);
+
+   assert(num_interp <= 32);
+   return MIN2(num_interp, 32);
 }
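si_get_ps_num_interp above adds one extra interpolant per color read when two-sided colors are enabled, then clamps to 32. A tiny worked sketch with hypothetical inputs (the 5-input count and the colors_read mask are made up):

#include <stdio.h>

/* Mirrors si_get_ps_num_interp above, with the selector/key fields passed in directly. */
static unsigned ps_num_interp(unsigned num_inputs, unsigned colors_read, int color_two_side)
{
   unsigned num_colors = !!(colors_read & 0x0f) + !!(colors_read & 0xf0);
   unsigned num_interp = num_inputs + (color_two_side ? num_colors : 0);

   return num_interp < 32 ? num_interp : 32; /* MIN2(num_interp, 32) */
}

int main(void)
{
   /* 5 inputs, both COLOR0 and COLOR1 read, two-sided colors enabled: 5 + 2 = 7. */
   printf("%u\n", ps_num_interp(5, 0x33, 1));
   return 0;
}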
 
 static unsigned si_get_spi_shader_col_format(struct si_shader *shader)
 {
-       unsigned value = shader->key.part.ps.epilog.spi_shader_col_format;
-       unsigned i, num_targets = (util_last_bit(value) + 3) / 4;
+   unsigned value = shader->key.part.ps.epilog.spi_shader_col_format;
+   unsigned i, num_targets = (util_last_bit(value) + 3) / 4;
 
-       /* If the i-th target format is set, all previous target formats must
-        * be non-zero to avoid hangs.
-        */
-       for (i = 0; i < num_targets; i++)
-               if (!(value & (0xf << (i * 4))))
-                       value |= V_028714_SPI_SHADER_32_R << (i * 4);
+   /* If the i-th target format is set, all previous target formats must
+    * be non-zero to avoid hangs.
+    */
+   for (i = 0; i < num_targets; i++)
+      if (!(value & (0xf << (i * 4))))
+         value |= V_028714_SPI_SHADER_32_R << (i * 4);
 
-       return value;
+   return value;
 }
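The loop above makes sure every MRT slot below the highest exported one carries a non-zero export format, since a gap in the formats can hang the hardware. A standalone sketch of that hole-filling step (FMT_32_R is a placeholder standing in for V_028714_SPI_SHADER_32_R, and the input mask in main() is hypothetical):

#include <stdio.h>

#define FMT_32_R 0x1 /* placeholder for V_028714_SPI_SHADER_32_R */

/* Equivalent of util_last_bit(): 1-based index of the highest set bit, 0 for an empty mask. */
static unsigned last_bit(unsigned v)
{
   return v ? 32 - (unsigned)__builtin_clz(v) : 0;
}

static unsigned fill_unset_mrt_formats(unsigned value)
{
   unsigned num_targets = (last_bit(value) + 3) / 4;

   /* If the i-th target format is set, all previous target formats must be non-zero. */
   for (unsigned i = 0; i < num_targets; i++)
      if (!(value & (0xfu << (i * 4))))
         value |= FMT_32_R << (i * 4);

   return value;
}

int main(void)
{
   /* MRT1 has a format but MRT0 does not: 0x40 becomes 0x41. */
   printf("0x%x\n", fill_unset_mrt_formats(0x40));
   return 0;
}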
 
 static void si_emit_shader_ps(struct si_context *sctx)
 {
-       struct si_shader *shader = sctx->queued.named.ps->shader;
-       unsigned initial_cdw = sctx->gfx_cs->current.cdw;
-
-       if (!shader)
-               return;
-
-       /* R_0286CC_SPI_PS_INPUT_ENA, R_0286D0_SPI_PS_INPUT_ADDR*/
-       radeon_opt_set_context_reg2(sctx, R_0286CC_SPI_PS_INPUT_ENA,
-                                   SI_TRACKED_SPI_PS_INPUT_ENA,
-                                   shader->ctx_reg.ps.spi_ps_input_ena,
-                                   shader->ctx_reg.ps.spi_ps_input_addr);
-
-       radeon_opt_set_context_reg(sctx, R_0286E0_SPI_BARYC_CNTL,
-                                  SI_TRACKED_SPI_BARYC_CNTL,
-                                  shader->ctx_reg.ps.spi_baryc_cntl);
-       radeon_opt_set_context_reg(sctx, R_0286D8_SPI_PS_IN_CONTROL,
-                                  SI_TRACKED_SPI_PS_IN_CONTROL,
-                                  shader->ctx_reg.ps.spi_ps_in_control);
-
-       /* R_028710_SPI_SHADER_Z_FORMAT, R_028714_SPI_SHADER_COL_FORMAT */
-       radeon_opt_set_context_reg2(sctx, R_028710_SPI_SHADER_Z_FORMAT,
-                                   SI_TRACKED_SPI_SHADER_Z_FORMAT,
-                                   shader->ctx_reg.ps.spi_shader_z_format,
-                                   shader->ctx_reg.ps.spi_shader_col_format);
-
-       radeon_opt_set_context_reg(sctx, R_02823C_CB_SHADER_MASK,
-                                  SI_TRACKED_CB_SHADER_MASK,
-                                  shader->ctx_reg.ps.cb_shader_mask);
-
-       if (initial_cdw != sctx->gfx_cs->current.cdw)
-               sctx->context_roll = true;
-}
+   struct si_shader *shader = sctx->queued.named.ps->shader;
+   unsigned initial_cdw = sctx->gfx_cs->current.cdw;
 
-static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
-{
-       struct si_shader_info *info = &shader->selector->info;
-       struct si_pm4_state *pm4;
-       unsigned spi_ps_in_control, spi_shader_col_format, cb_shader_mask;
-       unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
-       uint64_t va;
-       unsigned input_ena = shader->config.spi_ps_input_ena;
-
-       /* we need to enable at least one of them, otherwise we hang the GPU */
-       assert(G_0286CC_PERSP_SAMPLE_ENA(input_ena) ||
-              G_0286CC_PERSP_CENTER_ENA(input_ena) ||
-              G_0286CC_PERSP_CENTROID_ENA(input_ena) ||
-              G_0286CC_PERSP_PULL_MODEL_ENA(input_ena) ||
-              G_0286CC_LINEAR_SAMPLE_ENA(input_ena) ||
-              G_0286CC_LINEAR_CENTER_ENA(input_ena) ||
-              G_0286CC_LINEAR_CENTROID_ENA(input_ena) ||
-              G_0286CC_LINE_STIPPLE_TEX_ENA(input_ena));
-       /* POS_W_FLOAT_ENA requires one of the perspective weights. */
-       assert(!G_0286CC_POS_W_FLOAT_ENA(input_ena) ||
-              G_0286CC_PERSP_SAMPLE_ENA(input_ena) ||
-              G_0286CC_PERSP_CENTER_ENA(input_ena) ||
-              G_0286CC_PERSP_CENTROID_ENA(input_ena) ||
-              G_0286CC_PERSP_PULL_MODEL_ENA(input_ena));
-
-       /* Validate interpolation optimization flags (read as implications). */
-       assert(!shader->key.part.ps.prolog.bc_optimize_for_persp ||
-              (G_0286CC_PERSP_CENTER_ENA(input_ena) &&
-               G_0286CC_PERSP_CENTROID_ENA(input_ena)));
-       assert(!shader->key.part.ps.prolog.bc_optimize_for_linear ||
-              (G_0286CC_LINEAR_CENTER_ENA(input_ena) &&
-               G_0286CC_LINEAR_CENTROID_ENA(input_ena)));
-       assert(!shader->key.part.ps.prolog.force_persp_center_interp ||
-              (!G_0286CC_PERSP_SAMPLE_ENA(input_ena) &&
-               !G_0286CC_PERSP_CENTROID_ENA(input_ena)));
-       assert(!shader->key.part.ps.prolog.force_linear_center_interp ||
-              (!G_0286CC_LINEAR_SAMPLE_ENA(input_ena) &&
-               !G_0286CC_LINEAR_CENTROID_ENA(input_ena)));
-       assert(!shader->key.part.ps.prolog.force_persp_sample_interp ||
-              (!G_0286CC_PERSP_CENTER_ENA(input_ena) &&
-               !G_0286CC_PERSP_CENTROID_ENA(input_ena)));
-       assert(!shader->key.part.ps.prolog.force_linear_sample_interp ||
-              (!G_0286CC_LINEAR_CENTER_ENA(input_ena) &&
-               !G_0286CC_LINEAR_CENTROID_ENA(input_ena)));
-
-       /* Validate cases when the optimizations are off (read as implications). */
-       assert(shader->key.part.ps.prolog.bc_optimize_for_persp ||
-              !G_0286CC_PERSP_CENTER_ENA(input_ena) ||
-              !G_0286CC_PERSP_CENTROID_ENA(input_ena));
-       assert(shader->key.part.ps.prolog.bc_optimize_for_linear ||
-              !G_0286CC_LINEAR_CENTER_ENA(input_ena) ||
-              !G_0286CC_LINEAR_CENTROID_ENA(input_ena));
-
-       pm4 = si_get_shader_pm4_state(shader);
-       if (!pm4)
-               return;
-
-       pm4->atom.emit = si_emit_shader_ps;
-
-       /* SPI_BARYC_CNTL.POS_FLOAT_LOCATION
-        * Possible vaules:
-        * 0 -> Position = pixel center
-        * 1 -> Position = pixel centroid
-        * 2 -> Position = at sample position
-        *
-        * From GLSL 4.5 specification, section 7.1:
-        *   "The variable gl_FragCoord is available as an input variable from
-        *    within fragment shaders and it holds the window relative coordinates
-        *    (x, y, z, 1/w) values for the fragment. If multi-sampling, this
-        *    value can be for any location within the pixel, or one of the
-        *    fragment samples. The use of centroid does not further restrict
-        *    this value to be inside the current primitive."
-        *
-        * Meaning that centroid has no effect and we can return anything within
-        * the pixel. Thus, return the value at sample position, because that's
-        * the most accurate one shaders can get.
-        */
-       spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2);
-
-       if (info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] ==
-           TGSI_FS_COORD_PIXEL_CENTER_INTEGER)
-               spi_baryc_cntl |= S_0286E0_POS_FLOAT_ULC(1);
-
-       spi_shader_col_format = si_get_spi_shader_col_format(shader);
-       cb_shader_mask = ac_get_cb_shader_mask(spi_shader_col_format);
-
-       /* Ensure that some export memory is always allocated, for two reasons:
-        *
-        * 1) Correctness: The hardware ignores the EXEC mask if no export
-        *    memory is allocated, so KILL and alpha test do not work correctly
-        *    without this.
-        * 2) Performance: Every shader needs at least a NULL export, even when
-        *    it writes no color/depth output. The NULL export instruction
-        *    stalls without this setting.
-        *
-        * Don't add this to CB_SHADER_MASK.
-        *
-        * GFX10 supports pixel shaders without exports by setting both
-        * the color and Z formats to SPI_SHADER_ZERO. The hw will skip export
-        * instructions if any are present.
-        */
-       if ((sscreen->info.chip_class <= GFX9 ||
-            info->uses_kill ||
-            shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) &&
-           !spi_shader_col_format &&
-           !info->writes_z && !info->writes_stencil && !info->writes_samplemask)
-               spi_shader_col_format = V_028714_SPI_SHADER_32_R;
-
-       shader->ctx_reg.ps.spi_ps_input_ena = input_ena;
-       shader->ctx_reg.ps.spi_ps_input_addr = shader->config.spi_ps_input_addr;
-
-       /* Set interpolation controls. */
-       spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader)) |
-                           S_0286D8_PS_W32_EN(sscreen->ps_wave_size == 32);
-
-       shader->ctx_reg.ps.spi_baryc_cntl = spi_baryc_cntl;
-       shader->ctx_reg.ps.spi_ps_in_control = spi_ps_in_control;
-       shader->ctx_reg.ps.spi_shader_z_format =
-                       ac_get_spi_shader_z_format(info->writes_z,
-                                                  info->writes_stencil,
-                                                  info->writes_samplemask);
-       shader->ctx_reg.ps.spi_shader_col_format = spi_shader_col_format;
-       shader->ctx_reg.ps.cb_shader_mask = cb_shader_mask;
-
-       va = shader->bo->gpu_address;
-       si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
-       si_pm4_set_reg(pm4, R_00B020_SPI_SHADER_PGM_LO_PS, va >> 8);
-       si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, S_00B024_MEM_BASE(va >> 40));
-
-       uint32_t rsrc1 =
-               S_00B028_VGPRS((shader->config.num_vgprs - 1) /
-                              (sscreen->ps_wave_size == 32 ? 8 : 4)) |
-               S_00B028_DX10_CLAMP(1) |
-               S_00B028_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
-               S_00B028_FLOAT_MODE(shader->config.float_mode);
-
-       if (sscreen->info.chip_class < GFX10) {
-               rsrc1 |= S_00B028_SGPRS((shader->config.num_sgprs - 1) / 8);
-       }
-
-       si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS, rsrc1);
-       si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
-                      S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) |
-                      S_00B02C_USER_SGPR(SI_PS_NUM_USER_SGPR) |
-                      S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
+   if (!shader)
+      return;
+
+   /* R_0286CC_SPI_PS_INPUT_ENA, R_0286D0_SPI_PS_INPUT_ADDR */
+   radeon_opt_set_context_reg2(sctx, R_0286CC_SPI_PS_INPUT_ENA, SI_TRACKED_SPI_PS_INPUT_ENA,
+                               shader->ctx_reg.ps.spi_ps_input_ena,
+                               shader->ctx_reg.ps.spi_ps_input_addr);
+
+   radeon_opt_set_context_reg(sctx, R_0286E0_SPI_BARYC_CNTL, SI_TRACKED_SPI_BARYC_CNTL,
+                              shader->ctx_reg.ps.spi_baryc_cntl);
+   radeon_opt_set_context_reg(sctx, R_0286D8_SPI_PS_IN_CONTROL, SI_TRACKED_SPI_PS_IN_CONTROL,
+                              shader->ctx_reg.ps.spi_ps_in_control);
+
+   /* R_028710_SPI_SHADER_Z_FORMAT, R_028714_SPI_SHADER_COL_FORMAT */
+   radeon_opt_set_context_reg2(sctx, R_028710_SPI_SHADER_Z_FORMAT, SI_TRACKED_SPI_SHADER_Z_FORMAT,
+                               shader->ctx_reg.ps.spi_shader_z_format,
+                               shader->ctx_reg.ps.spi_shader_col_format);
+
+   radeon_opt_set_context_reg(sctx, R_02823C_CB_SHADER_MASK, SI_TRACKED_CB_SHADER_MASK,
+                              shader->ctx_reg.ps.cb_shader_mask);
+
+   if (initial_cdw != sctx->gfx_cs->current.cdw)
+      sctx->context_roll = true;
 }
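
For readers skimming the reformatted emit path above: si_emit_shader_ps writes its context registers through the radeon_opt_set_context_reg* helpers and then compares the command-stream size against initial_cdw to decide whether a context roll happened. Below is a rough, self-contained sketch of that shadow-and-skip pattern; the toy_* names, fields and values are invented for the illustration and are not radeonsi definitions.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_cs {
   uint32_t buf[64];
   unsigned cdw; /* current dword count, like gfx_cs->current.cdw */
};

struct toy_tracked {
   uint32_t value[8];
   uint8_t saved; /* bitmask of slots whose shadowed value is known */
};

/* Emit "reg = value" only when the shadowed copy differs, so redundant
 * state changes cost nothing and cause no context roll. */
static void toy_set_reg(struct toy_cs *cs, struct toy_tracked *t,
                        unsigned slot, uint32_t reg, uint32_t value)
{
   if ((t->saved & (1u << slot)) && t->value[slot] == value)
      return;

   cs->buf[cs->cdw++] = reg;   /* stand-in for the packet header */
   cs->buf[cs->cdw++] = value;
   t->value[slot] = value;
   t->saved |= (uint8_t)(1u << slot);
}

int main(void)
{
   struct toy_cs cs = {0};
   struct toy_tracked t = {0};
   unsigned initial_cdw = cs.cdw;

   toy_set_reg(&cs, &t, 0, 0x286CC, 0x1); /* first write is emitted */
   toy_set_reg(&cs, &t, 0, 0x286CC, 0x1); /* identical value is skipped */

   bool context_roll = initial_cdw != cs.cdw;
   printf("emitted %u dwords, context_roll = %d\n", cs.cdw, context_roll);
   return 0;
}

Skipping unchanged values is worthwhile because the driver tracks sctx->context_roll precisely so that unnecessary state changes are not paid for on every emit.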
 
-static void si_shader_init_pm4_state(struct si_screen *sscreen,
-                                     struct si_shader *shader)
+static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
 {
-       switch (shader->selector->type) {
-       case PIPE_SHADER_VERTEX:
-               if (shader->key.as_ls)
-                       si_shader_ls(sscreen, shader);
-               else if (shader->key.as_es)
-                       si_shader_es(sscreen, shader);
-               else if (shader->key.as_ngg)
-                       gfx10_shader_ngg(sscreen, shader);
-               else
-                       si_shader_vs(sscreen, shader, NULL);
-               break;
-       case PIPE_SHADER_TESS_CTRL:
-               si_shader_hs(sscreen, shader);
-               break;
-       case PIPE_SHADER_TESS_EVAL:
-               if (shader->key.as_es)
-                       si_shader_es(sscreen, shader);
-               else if (shader->key.as_ngg)
-                       gfx10_shader_ngg(sscreen, shader);
-               else
-                       si_shader_vs(sscreen, shader, NULL);
-               break;
-       case PIPE_SHADER_GEOMETRY:
-               if (shader->key.as_ngg)
-                       gfx10_shader_ngg(sscreen, shader);
-               else
-                       si_shader_gs(sscreen, shader);
-               break;
-       case PIPE_SHADER_FRAGMENT:
-               si_shader_ps(sscreen, shader);
-               break;
-       default:
-               assert(0);
-       }
+   struct si_shader_info *info = &shader->selector->info;
+   struct si_pm4_state *pm4;
+   unsigned spi_ps_in_control, spi_shader_col_format, cb_shader_mask;
+   unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
+   uint64_t va;
+   unsigned input_ena = shader->config.spi_ps_input_ena;
+
+   /* we need to enable at least one of them, otherwise we hang the GPU */
+   assert(G_0286CC_PERSP_SAMPLE_ENA(input_ena) || G_0286CC_PERSP_CENTER_ENA(input_ena) ||
+          G_0286CC_PERSP_CENTROID_ENA(input_ena) || G_0286CC_PERSP_PULL_MODEL_ENA(input_ena) ||
+          G_0286CC_LINEAR_SAMPLE_ENA(input_ena) || G_0286CC_LINEAR_CENTER_ENA(input_ena) ||
+          G_0286CC_LINEAR_CENTROID_ENA(input_ena) || G_0286CC_LINE_STIPPLE_TEX_ENA(input_ena));
+   /* POS_W_FLOAT_ENA requires one of the perspective weights. */
+   assert(!G_0286CC_POS_W_FLOAT_ENA(input_ena) || G_0286CC_PERSP_SAMPLE_ENA(input_ena) ||
+          G_0286CC_PERSP_CENTER_ENA(input_ena) || G_0286CC_PERSP_CENTROID_ENA(input_ena) ||
+          G_0286CC_PERSP_PULL_MODEL_ENA(input_ena));
+
+   /* Validate interpolation optimization flags (read as implications). */
+   assert(!shader->key.part.ps.prolog.bc_optimize_for_persp ||
+          (G_0286CC_PERSP_CENTER_ENA(input_ena) && G_0286CC_PERSP_CENTROID_ENA(input_ena)));
+   assert(!shader->key.part.ps.prolog.bc_optimize_for_linear ||
+          (G_0286CC_LINEAR_CENTER_ENA(input_ena) && G_0286CC_LINEAR_CENTROID_ENA(input_ena)));
+   assert(!shader->key.part.ps.prolog.force_persp_center_interp ||
+          (!G_0286CC_PERSP_SAMPLE_ENA(input_ena) && !G_0286CC_PERSP_CENTROID_ENA(input_ena)));
+   assert(!shader->key.part.ps.prolog.force_linear_center_interp ||
+          (!G_0286CC_LINEAR_SAMPLE_ENA(input_ena) && !G_0286CC_LINEAR_CENTROID_ENA(input_ena)));
+   assert(!shader->key.part.ps.prolog.force_persp_sample_interp ||
+          (!G_0286CC_PERSP_CENTER_ENA(input_ena) && !G_0286CC_PERSP_CENTROID_ENA(input_ena)));
+   assert(!shader->key.part.ps.prolog.force_linear_sample_interp ||
+          (!G_0286CC_LINEAR_CENTER_ENA(input_ena) && !G_0286CC_LINEAR_CENTROID_ENA(input_ena)));
+
+   /* Validate cases when the optimizations are off (read as implications). */
+   assert(shader->key.part.ps.prolog.bc_optimize_for_persp ||
+          !G_0286CC_PERSP_CENTER_ENA(input_ena) || !G_0286CC_PERSP_CENTROID_ENA(input_ena));
+   assert(shader->key.part.ps.prolog.bc_optimize_for_linear ||
+          !G_0286CC_LINEAR_CENTER_ENA(input_ena) || !G_0286CC_LINEAR_CENTROID_ENA(input_ena));
+
+   pm4 = si_get_shader_pm4_state(shader);
+   if (!pm4)
+      return;
+
+   pm4->atom.emit = si_emit_shader_ps;
+
+   /* SPI_BARYC_CNTL.POS_FLOAT_LOCATION
+    * Possible values:
+    * 0 -> Position = pixel center
+    * 1 -> Position = pixel centroid
+    * 2 -> Position = at sample position
+    *
+    * From GLSL 4.5 specification, section 7.1:
+    *   "The variable gl_FragCoord is available as an input variable from
+    *    within fragment shaders and it holds the window relative coordinates
+    *    (x, y, z, 1/w) values for the fragment. If multi-sampling, this
+    *    value can be for any location within the pixel, or one of the
+    *    fragment samples. The use of centroid does not further restrict
+    *    this value to be inside the current primitive."
+    *
+    * Meaning that centroid has no effect and we can return anything within
+    * the pixel. Thus, return the value at sample position, because that's
+    * the most accurate one shaders can get.
+    */
+   spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2);
+
+   if (info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] == TGSI_FS_COORD_PIXEL_CENTER_INTEGER)
+      spi_baryc_cntl |= S_0286E0_POS_FLOAT_ULC(1);
+
+   spi_shader_col_format = si_get_spi_shader_col_format(shader);
+   cb_shader_mask = ac_get_cb_shader_mask(spi_shader_col_format);
+
+   /* Ensure that some export memory is always allocated, for two reasons:
+    *
+    * 1) Correctness: The hardware ignores the EXEC mask if no export
+    *    memory is allocated, so KILL and alpha test do not work correctly
+    *    without this.
+    * 2) Performance: Every shader needs at least a NULL export, even when
+    *    it writes no color/depth output. The NULL export instruction
+    *    stalls without this setting.
+    *
+    * Don't add this to CB_SHADER_MASK.
+    *
+    * GFX10 supports pixel shaders without exports by setting both
+    * the color and Z formats to SPI_SHADER_ZERO. The hw will skip export
+    * instructions if any are present.
+    */
+   if ((sscreen->info.chip_class <= GFX9 || info->uses_kill ||
+        shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) &&
+       !spi_shader_col_format && !info->writes_z && !info->writes_stencil &&
+       !info->writes_samplemask)
+      spi_shader_col_format = V_028714_SPI_SHADER_32_R;
+
+   shader->ctx_reg.ps.spi_ps_input_ena = input_ena;
+   shader->ctx_reg.ps.spi_ps_input_addr = shader->config.spi_ps_input_addr;
+
+   /* Set interpolation controls. */
+   spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader)) |
+                       S_0286D8_PS_W32_EN(sscreen->ps_wave_size == 32);
+
+   shader->ctx_reg.ps.spi_baryc_cntl = spi_baryc_cntl;
+   shader->ctx_reg.ps.spi_ps_in_control = spi_ps_in_control;
+   shader->ctx_reg.ps.spi_shader_z_format =
+      ac_get_spi_shader_z_format(info->writes_z, info->writes_stencil, info->writes_samplemask);
+   shader->ctx_reg.ps.spi_shader_col_format = spi_shader_col_format;
+   shader->ctx_reg.ps.cb_shader_mask = cb_shader_mask;
+
+   va = shader->bo->gpu_address;
+   si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
+   si_pm4_set_reg(pm4, R_00B020_SPI_SHADER_PGM_LO_PS, va >> 8);
+   si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, S_00B024_MEM_BASE(va >> 40));
+
+   uint32_t rsrc1 =
+      S_00B028_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ps_wave_size == 32 ? 8 : 4)) |
+      S_00B028_DX10_CLAMP(1) | S_00B028_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
+      S_00B028_FLOAT_MODE(shader->config.float_mode);
+
+   if (sscreen->info.chip_class < GFX10) {
+      rsrc1 |= S_00B028_SGPRS((shader->config.num_sgprs - 1) / 8);
+   }
+
+   si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS, rsrc1);
+   si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
+                  S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) |
+                     S_00B02C_USER_SGPR(SI_PS_NUM_USER_SGPR) |
+                     S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
+}
+
+static void si_shader_init_pm4_state(struct si_screen *sscreen, struct si_shader *shader)
+{
+   switch (shader->selector->type) {
+   case PIPE_SHADER_VERTEX:
+      if (shader->key.as_ls)
+         si_shader_ls(sscreen, shader);
+      else if (shader->key.as_es)
+         si_shader_es(sscreen, shader);
+      else if (shader->key.as_ngg)
+         gfx10_shader_ngg(sscreen, shader);
+      else
+         si_shader_vs(sscreen, shader, NULL);
+      break;
+   case PIPE_SHADER_TESS_CTRL:
+      si_shader_hs(sscreen, shader);
+      break;
+   case PIPE_SHADER_TESS_EVAL:
+      if (shader->key.as_es)
+         si_shader_es(sscreen, shader);
+      else if (shader->key.as_ngg)
+         gfx10_shader_ngg(sscreen, shader);
+      else
+         si_shader_vs(sscreen, shader, NULL);
+      break;
+   case PIPE_SHADER_GEOMETRY:
+      if (shader->key.as_ngg)
+         gfx10_shader_ngg(sscreen, shader);
+      else
+         si_shader_gs(sscreen, shader);
+      break;
+   case PIPE_SHADER_FRAGMENT:
+      si_shader_ps(sscreen, shader);
+      break;
+   default:
+      assert(0);
+   }
 }
 
 static unsigned si_get_alpha_test_func(struct si_context *sctx)
 {
-       /* Alpha-test should be disabled if colorbuffer 0 is integer. */
-       return sctx->queued.named.dsa->alpha_func;
+   /* Alpha-test should be disabled if colorbuffer 0 is integer. */
+   return sctx->queued.named.dsa->alpha_func;
 }
 
-void si_shader_selector_key_vs(struct si_context *sctx,
-                              struct si_shader_selector *vs,
-                              struct si_shader_key *key,
-                              struct si_vs_prolog_bits *prolog_key)
+void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selector *vs,
+                               struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key)
 {
-       if (!sctx->vertex_elements ||
-           vs->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD])
-               return;
-
-       struct si_vertex_elements *elts = sctx->vertex_elements;
-
-       prolog_key->instance_divisor_is_one = elts->instance_divisor_is_one;
-       prolog_key->instance_divisor_is_fetched = elts->instance_divisor_is_fetched;
-       prolog_key->unpack_instance_id_from_vertex_id =
-               sctx->prim_discard_cs_instancing;
-
-       /* Prefer a monolithic shader to allow scheduling divisions around
-        * VBO loads. */
-       if (prolog_key->instance_divisor_is_fetched)
-               key->opt.prefer_mono = 1;
-
-       unsigned count = MIN2(vs->info.num_inputs, elts->count);
-       unsigned count_mask = (1 << count) - 1;
-       unsigned fix = elts->fix_fetch_always & count_mask;
-       unsigned opencode = elts->fix_fetch_opencode & count_mask;
-
-       if (sctx->vertex_buffer_unaligned & elts->vb_alignment_check_mask) {
-               uint32_t mask = elts->fix_fetch_unaligned & count_mask;
-               while (mask) {
-                       unsigned i = u_bit_scan(&mask);
-                       unsigned log_hw_load_size = 1 + ((elts->hw_load_is_dword >> i) & 1);
-                       unsigned vbidx = elts->vertex_buffer_index[i];
-                       struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbidx];
-                       unsigned align_mask = (1 << log_hw_load_size) - 1;
-                       if (vb->buffer_offset & align_mask ||
-                           vb->stride & align_mask) {
-                               fix |= 1 << i;
-                               opencode |= 1 << i;
-                       }
-               }
-       }
-
-       while (fix) {
-               unsigned i = u_bit_scan(&fix);
-               key->mono.vs_fix_fetch[i].bits = elts->fix_fetch[i];
-       }
-       key->mono.vs_fetch_opencode = opencode;
-}
+   if (!sctx->vertex_elements || vs->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD])
+      return;
 
-static void si_shader_selector_key_hw_vs(struct si_context *sctx,
-                                        struct si_shader_selector *vs,
-                                        struct si_shader_key *key)
-{
-       struct si_shader_selector *ps = sctx->ps_shader.cso;
-
-       key->opt.clip_disable =
-               sctx->queued.named.rasterizer->clip_plane_enable == 0 &&
-               (vs->info.clipdist_writemask ||
-                vs->info.writes_clipvertex) &&
-               !vs->info.culldist_writemask;
-
-       /* Find out if PS is disabled. */
-       bool ps_disabled = true;
-       if (ps) {
-               bool ps_modifies_zs = ps->info.uses_kill ||
-                                     ps->info.writes_z ||
-                                     ps->info.writes_stencil ||
-                                     ps->info.writes_samplemask ||
-                                     sctx->queued.named.blend->alpha_to_coverage ||
-                                     si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS;
-               unsigned ps_colormask = si_get_total_colormask(sctx);
-
-               ps_disabled = sctx->queued.named.rasterizer->rasterizer_discard ||
-                             (!ps_colormask &&
-                              !ps_modifies_zs &&
-                              !ps->info.writes_memory);
-       }
-
-       /* Find out which VS outputs aren't used by the PS. */
-       uint64_t outputs_written = vs->outputs_written_before_ps;
-       uint64_t inputs_read = 0;
-
-       /* Ignore outputs that are not passed from VS to PS. */
-       outputs_written &= ~((1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_POSITION, 0, true)) |
-                            (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_PSIZE, 0, true)) |
-                            (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_CLIPVERTEX, 0, true)));
-
-       if (!ps_disabled) {
-               inputs_read = ps->inputs_read;
-       }
-
-       uint64_t linked = outputs_written & inputs_read;
-
-       key->opt.kill_outputs = ~linked & outputs_written;
-       key->opt.ngg_culling = sctx->ngg_culling;
+   struct si_vertex_elements *elts = sctx->vertex_elements;
+
+   prolog_key->instance_divisor_is_one = elts->instance_divisor_is_one;
+   prolog_key->instance_divisor_is_fetched = elts->instance_divisor_is_fetched;
+   prolog_key->unpack_instance_id_from_vertex_id = sctx->prim_discard_cs_instancing;
+
+   /* Prefer a monolithic shader to allow scheduling divisions around
+    * VBO loads. */
+   if (prolog_key->instance_divisor_is_fetched)
+      key->opt.prefer_mono = 1;
+
+   unsigned count = MIN2(vs->info.num_inputs, elts->count);
+   unsigned count_mask = (1 << count) - 1;
+   unsigned fix = elts->fix_fetch_always & count_mask;
+   unsigned opencode = elts->fix_fetch_opencode & count_mask;
+
+   if (sctx->vertex_buffer_unaligned & elts->vb_alignment_check_mask) {
+      uint32_t mask = elts->fix_fetch_unaligned & count_mask;
+      while (mask) {
+         unsigned i = u_bit_scan(&mask);
+         unsigned log_hw_load_size = 1 + ((elts->hw_load_is_dword >> i) & 1);
+         unsigned vbidx = elts->vertex_buffer_index[i];
+         struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbidx];
+         unsigned align_mask = (1 << log_hw_load_size) - 1;
+         if (vb->buffer_offset & align_mask || vb->stride & align_mask) {
+            fix |= 1 << i;
+            opencode |= 1 << i;
+         }
+      }
+   }
+
+   while (fix) {
+      unsigned i = u_bit_scan(&fix);
+      key->mono.vs_fix_fetch[i].bits = elts->fix_fetch[i];
+   }
+   key->mono.vs_fetch_opencode = opencode;
 }
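
The alignment loop in si_shader_selector_key_vs above marks a vertex input for the fixed/opencoded fetch path whenever its buffer offset or stride is not aligned to the hardware load size derived from hw_load_is_dword. A minimal stand-alone sketch of that per-input test follows; the input count, offsets and strides are made-up example values, not driver state.

#include <stdio.h>

int main(void)
{
   const unsigned num_inputs = 3;
   const unsigned hw_load_is_dword = 0x5;           /* inputs 0 and 2 load dwords */
   const unsigned buffer_offset[] = { 0, 6, 2 };    /* bytes */
   const unsigned stride[]        = { 16, 8, 12 };  /* bytes */

   unsigned fix = 0;
   for (unsigned i = 0; i < num_inputs; i++) {
      unsigned log_hw_load_size = 1 + ((hw_load_is_dword >> i) & 1); /* 1 -> 2 B, 2 -> 4 B */
      unsigned align_mask = (1u << log_hw_load_size) - 1;
      if ((buffer_offset[i] & align_mask) || (stride[i] & align_mask))
         fix |= 1u << i;
   }

   /* Only input 2 trips the check: its 2-byte offset is not 4-byte aligned. */
   printf("inputs needing the fixed fetch path: 0x%x\n", fix);
   return 0;
}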
 
-/* Compute the key for the hw shader variant */
-static inline void si_shader_selector_key(struct pipe_context *ctx,
-                                         struct si_shader_selector *sel,
-                                         union si_vgt_stages_key stages_key,
-                                         struct si_shader_key *key)
+static void si_shader_selector_key_hw_vs(struct si_context *sctx, struct si_shader_selector *vs,
+                                         struct si_shader_key *key)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-
-       memset(key, 0, sizeof(*key));
-
-       switch (sel->type) {
-       case PIPE_SHADER_VERTEX:
-               si_shader_selector_key_vs(sctx, sel, key, &key->part.vs.prolog);
-
-               if (sctx->tes_shader.cso)
-                       key->as_ls = 1;
-               else if (sctx->gs_shader.cso) {
-                       key->as_es = 1;
-                       key->as_ngg = stages_key.u.ngg;
-               } else {
-                       key->as_ngg = stages_key.u.ngg;
-                       si_shader_selector_key_hw_vs(sctx, sel, key);
-
-                       if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
-                               key->mono.u.vs_export_prim_id = 1;
-               }
-               break;
-       case PIPE_SHADER_TESS_CTRL:
-               if (sctx->chip_class >= GFX9) {
-                       si_shader_selector_key_vs(sctx, sctx->vs_shader.cso,
-                                                 key, &key->part.tcs.ls_prolog);
-                       key->part.tcs.ls = sctx->vs_shader.cso;
-
-                       /* When the LS VGPR fix is needed, monolithic shaders
-                        * can:
-                        *  - avoid initializing EXEC in both the LS prolog
-                        *    and the LS main part when !vs_needs_prolog
-                        *  - remove the fixup for unused input VGPRs
-                        */
-                       key->part.tcs.ls_prolog.ls_vgpr_fix = sctx->ls_vgpr_fix;
-
-                       /* The LS output / HS input layout can be communicated
-                        * directly instead of via user SGPRs for merged LS-HS.
-                        * The LS VGPR fix prefers this too.
-                        */
-                       key->opt.prefer_mono = 1;
-               }
-
-               key->part.tcs.epilog.prim_mode =
-                       sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
-               key->part.tcs.epilog.invoc0_tess_factors_are_def =
-                       sel->info.tessfactors_are_def_in_all_invocs;
-               key->part.tcs.epilog.tes_reads_tess_factors =
-                       sctx->tes_shader.cso->info.reads_tess_factors;
-
-               if (sel == sctx->fixed_func_tcs_shader.cso)
-                       key->mono.u.ff_tcs_inputs_to_copy = sctx->vs_shader.cso->outputs_written;
-               break;
-       case PIPE_SHADER_TESS_EVAL:
-               key->as_ngg = stages_key.u.ngg;
-
-               if (sctx->gs_shader.cso)
-                       key->as_es = 1;
-               else {
-                       si_shader_selector_key_hw_vs(sctx, sel, key);
-
-                       if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
-                               key->mono.u.vs_export_prim_id = 1;
-               }
-               break;
-       case PIPE_SHADER_GEOMETRY:
-               if (sctx->chip_class >= GFX9) {
-                       if (sctx->tes_shader.cso) {
-                               key->part.gs.es = sctx->tes_shader.cso;
-                       } else {
-                               si_shader_selector_key_vs(sctx, sctx->vs_shader.cso,
-                                                         key, &key->part.gs.vs_prolog);
-                               key->part.gs.es = sctx->vs_shader.cso;
-                               key->part.gs.prolog.gfx9_prev_is_vs = 1;
-                       }
-
-                       key->as_ngg = stages_key.u.ngg;
-
-                       /* Merged ES-GS can have unbalanced wave usage.
-                        *
-                        * ES threads are per-vertex, while GS threads are
-                        * per-primitive. So without any amplification, there
-                        * are fewer GS threads than ES threads, which can result
-                        * in empty (no-op) GS waves. With too much amplification,
-                        * there are more GS threads than ES threads, which
-                        * can result in empty (no-op) ES waves.
-                        *
-                        * Non-monolithic shaders are implemented by setting EXEC
-                        * at the beginning of shader parts, and don't jump to
-                        * the end if EXEC is 0.
-                        *
-                        * Monolithic shaders use conditional blocks, so they can
-                        * jump and skip empty waves of ES or GS. So set this to
-                        * always use optimized variants, which are monolithic.
-                        */
-                       key->opt.prefer_mono = 1;
-               }
-               key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix;
-               break;
-       case PIPE_SHADER_FRAGMENT: {
-               struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
-               struct si_state_blend *blend = sctx->queued.named.blend;
-
-               if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
-                   sel->info.colors_written == 0x1)
-                       key->part.ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1;
-
-               /* Select the shader color format based on whether
-                * blending or alpha are needed.
-                */
-               key->part.ps.epilog.spi_shader_col_format =
-                       (blend->blend_enable_4bit & blend->need_src_alpha_4bit &
-                        sctx->framebuffer.spi_shader_col_format_blend_alpha) |
-                       (blend->blend_enable_4bit & ~blend->need_src_alpha_4bit &
-                        sctx->framebuffer.spi_shader_col_format_blend) |
-                       (~blend->blend_enable_4bit & blend->need_src_alpha_4bit &
-                        sctx->framebuffer.spi_shader_col_format_alpha) |
-                       (~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit &
-                        sctx->framebuffer.spi_shader_col_format);
-               key->part.ps.epilog.spi_shader_col_format &= blend->cb_target_enabled_4bit;
-
-               /* The output for dual source blending should have
-                * the same format as the first output.
-                */
-               if (blend->dual_src_blend) {
-                       key->part.ps.epilog.spi_shader_col_format |=
-                               (key->part.ps.epilog.spi_shader_col_format & 0xf) << 4;
-               }
-
-               /* If alpha-to-coverage is enabled, we have to export alpha
-                * even if there is no color buffer.
-                */
-               if (!(key->part.ps.epilog.spi_shader_col_format & 0xf) &&
-                   blend->alpha_to_coverage)
-                       key->part.ps.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR;
-
-               /* On GFX6 and GFX7 except Hawaii, the CB doesn't clamp outputs
-                * to the range supported by the type if a channel has less
-                * than 16 bits and the export format is 16_ABGR.
-                */
-               if (sctx->chip_class <= GFX7 && sctx->family != CHIP_HAWAII) {
-                       key->part.ps.epilog.color_is_int8 = sctx->framebuffer.color_is_int8;
-                       key->part.ps.epilog.color_is_int10 = sctx->framebuffer.color_is_int10;
-               }
-
-               /* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */
-               if (!key->part.ps.epilog.last_cbuf) {
-                       key->part.ps.epilog.spi_shader_col_format &= sel->colors_written_4bit;
-                       key->part.ps.epilog.color_is_int8 &= sel->info.colors_written;
-                       key->part.ps.epilog.color_is_int10 &= sel->info.colors_written;
-               }
-
-               bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim);
-               bool is_line = util_prim_is_lines(sctx->current_rast_prim);
-
-               key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read;
-               key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.colors_read;
-
-               key->part.ps.epilog.alpha_to_one = blend->alpha_to_one &&
-                                                  rs->multisample_enable;
-
-               key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly;
-               key->part.ps.epilog.poly_line_smoothing = ((is_poly && rs->poly_smooth) ||
-                                                          (is_line && rs->line_smooth)) &&
-                                                         sctx->framebuffer.nr_samples <= 1;
-               key->part.ps.epilog.clamp_color = rs->clamp_fragment_color;
-
-               if (sctx->ps_iter_samples > 1 &&
-                   sel->info.reads_samplemask) {
-                       key->part.ps.prolog.samplemask_log_ps_iter =
-                               util_logbase2(sctx->ps_iter_samples);
-               }
-
-               if (rs->force_persample_interp &&
-                   rs->multisample_enable &&
-                   sctx->framebuffer.nr_samples > 1 &&
-                   sctx->ps_iter_samples > 1) {
-                       key->part.ps.prolog.force_persp_sample_interp =
-                               sel->info.uses_persp_center ||
-                               sel->info.uses_persp_centroid;
-
-                       key->part.ps.prolog.force_linear_sample_interp =
-                               sel->info.uses_linear_center ||
-                               sel->info.uses_linear_centroid;
-               } else if (rs->multisample_enable &&
-                          sctx->framebuffer.nr_samples > 1) {
-                       key->part.ps.prolog.bc_optimize_for_persp =
-                               sel->info.uses_persp_center &&
-                               sel->info.uses_persp_centroid;
-                       key->part.ps.prolog.bc_optimize_for_linear =
-                               sel->info.uses_linear_center &&
-                               sel->info.uses_linear_centroid;
-               } else {
-                       /* Make sure SPI doesn't compute more than 1 pair
-                        * of (i,j), which is the optimization here. */
-                       key->part.ps.prolog.force_persp_center_interp =
-                               sel->info.uses_persp_center +
-                               sel->info.uses_persp_centroid +
-                               sel->info.uses_persp_sample > 1;
-
-                       key->part.ps.prolog.force_linear_center_interp =
-                               sel->info.uses_linear_center +
-                               sel->info.uses_linear_centroid +
-                               sel->info.uses_linear_sample > 1;
-
-                       if (sel->info.uses_persp_opcode_interp_sample ||
-                           sel->info.uses_linear_opcode_interp_sample)
-                               key->mono.u.ps.interpolate_at_sample_force_center = 1;
-               }
-
-               key->part.ps.epilog.alpha_func = si_get_alpha_test_func(sctx);
-
-               /* ps_uses_fbfetch is true only if the color buffer is bound. */
-               if (sctx->ps_uses_fbfetch && !sctx->blitter->running) {
-                       struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0];
-                       struct pipe_resource *tex = cb0->texture;
-
-                       /* 1D textures are allocated and used as 2D on GFX9. */
-                       key->mono.u.ps.fbfetch_msaa = sctx->framebuffer.nr_samples > 1;
-                       key->mono.u.ps.fbfetch_is_1D = sctx->chip_class != GFX9 &&
-                                                      (tex->target == PIPE_TEXTURE_1D ||
-                                                       tex->target == PIPE_TEXTURE_1D_ARRAY);
-                       key->mono.u.ps.fbfetch_layered = tex->target == PIPE_TEXTURE_1D_ARRAY ||
-                                                        tex->target == PIPE_TEXTURE_2D_ARRAY ||
-                                                        tex->target == PIPE_TEXTURE_CUBE ||
-                                                        tex->target == PIPE_TEXTURE_CUBE_ARRAY ||
-                                                        tex->target == PIPE_TEXTURE_3D;
-               }
-               break;
-       }
-       default:
-               assert(0);
-       }
-
-       if (unlikely(sctx->screen->debug_flags & DBG(NO_OPT_VARIANT)))
-               memset(&key->opt, 0, sizeof(key->opt));
+   struct si_shader_selector *ps = sctx->ps_shader.cso;
+
+   key->opt.clip_disable = sctx->queued.named.rasterizer->clip_plane_enable == 0 &&
+                           (vs->info.clipdist_writemask || vs->info.writes_clipvertex) &&
+                           !vs->info.culldist_writemask;
+
+   /* Find out if PS is disabled. */
+   bool ps_disabled = true;
+   if (ps) {
+      bool ps_modifies_zs = ps->info.uses_kill || ps->info.writes_z || ps->info.writes_stencil ||
+                            ps->info.writes_samplemask ||
+                            sctx->queued.named.blend->alpha_to_coverage ||
+                            si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS;
+      unsigned ps_colormask = si_get_total_colormask(sctx);
+
+      ps_disabled = sctx->queued.named.rasterizer->rasterizer_discard ||
+                    (!ps_colormask && !ps_modifies_zs && !ps->info.writes_memory);
+   }
+
+   /* Find out which VS outputs aren't used by the PS. */
+   uint64_t outputs_written = vs->outputs_written_before_ps;
+   uint64_t inputs_read = 0;
+
+   /* Ignore outputs that are not passed from VS to PS. */
+   outputs_written &= ~((1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_POSITION, 0, true)) |
+                        (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_PSIZE, 0, true)) |
+                        (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_CLIPVERTEX, 0, true)));
+
+   if (!ps_disabled) {
+      inputs_read = ps->inputs_read;
+   }
+
+   uint64_t linked = outputs_written & inputs_read;
+
+   key->opt.kill_outputs = ~linked & outputs_written;
+   key->opt.ngg_culling = sctx->ngg_culling;
 }
 
-static void si_build_shader_variant(struct si_shader *shader,
-                                   int thread_index,
-                                   bool low_priority)
-{
-       struct si_shader_selector *sel = shader->selector;
-       struct si_screen *sscreen = sel->screen;
-       struct ac_llvm_compiler *compiler;
-       struct pipe_debug_callback *debug = &shader->compiler_ctx_state.debug;
-
-       if (thread_index >= 0) {
-               if (low_priority) {
-                       assert(thread_index < ARRAY_SIZE(sscreen->compiler_lowp));
-                       compiler = &sscreen->compiler_lowp[thread_index];
-               } else {
-                       assert(thread_index < ARRAY_SIZE(sscreen->compiler));
-                       compiler = &sscreen->compiler[thread_index];
-               }
-               if (!debug->async)
-                       debug = NULL;
-       } else {
-               assert(!low_priority);
-               compiler = shader->compiler_ctx_state.compiler;
-       }
-
-       if (!compiler->passes)
-               si_init_compiler(sscreen, compiler);
-
-       if (unlikely(!si_create_shader_variant(sscreen, compiler, shader, debug))) {
-               PRINT_ERR("Failed to build shader variant (type=%u)\n",
-                         sel->type);
-               shader->compilation_failed = true;
-               return;
-       }
-
-       if (shader->compiler_ctx_state.is_debug_context) {
-               FILE *f = open_memstream(&shader->shader_log,
-                                        &shader->shader_log_size);
-               if (f) {
-                       si_shader_dump(sscreen, shader, NULL, f, false);
-                       fclose(f);
-               }
-       }
-
-       si_shader_init_pm4_state(sscreen, shader);
+/* Compute the key for the hw shader variant */
+static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_shader_selector *sel,
+                                          union si_vgt_stages_key stages_key,
+                                          struct si_shader_key *key)
+{
+   struct si_context *sctx = (struct si_context *)ctx;
+
+   memset(key, 0, sizeof(*key));
+
+   switch (sel->type) {
+   case PIPE_SHADER_VERTEX:
+      si_shader_selector_key_vs(sctx, sel, key, &key->part.vs.prolog);
+
+      if (sctx->tes_shader.cso)
+         key->as_ls = 1;
+      else if (sctx->gs_shader.cso) {
+         key->as_es = 1;
+         key->as_ngg = stages_key.u.ngg;
+      } else {
+         key->as_ngg = stages_key.u.ngg;
+         si_shader_selector_key_hw_vs(sctx, sel, key);
+
+         if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
+            key->mono.u.vs_export_prim_id = 1;
+      }
+      break;
+   case PIPE_SHADER_TESS_CTRL:
+      if (sctx->chip_class >= GFX9) {
+         si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, key, &key->part.tcs.ls_prolog);
+         key->part.tcs.ls = sctx->vs_shader.cso;
+
+         /* When the LS VGPR fix is needed, monolithic shaders
+          * can:
+          *  - avoid initializing EXEC in both the LS prolog
+          *    and the LS main part when !vs_needs_prolog
+          *  - remove the fixup for unused input VGPRs
+          */
+         key->part.tcs.ls_prolog.ls_vgpr_fix = sctx->ls_vgpr_fix;
+
+         /* The LS output / HS input layout can be communicated
+          * directly instead of via user SGPRs for merged LS-HS.
+          * The LS VGPR fix prefers this too.
+          */
+         key->opt.prefer_mono = 1;
+      }
+
+      key->part.tcs.epilog.prim_mode =
+         sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
+      key->part.tcs.epilog.invoc0_tess_factors_are_def =
+         sel->info.tessfactors_are_def_in_all_invocs;
+      key->part.tcs.epilog.tes_reads_tess_factors = sctx->tes_shader.cso->info.reads_tess_factors;
+
+      if (sel == sctx->fixed_func_tcs_shader.cso)
+         key->mono.u.ff_tcs_inputs_to_copy = sctx->vs_shader.cso->outputs_written;
+      break;
+   case PIPE_SHADER_TESS_EVAL:
+      key->as_ngg = stages_key.u.ngg;
+
+      if (sctx->gs_shader.cso)
+         key->as_es = 1;
+      else {
+         si_shader_selector_key_hw_vs(sctx, sel, key);
+
+         if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
+            key->mono.u.vs_export_prim_id = 1;
+      }
+      break;
+   case PIPE_SHADER_GEOMETRY:
+      if (sctx->chip_class >= GFX9) {
+         if (sctx->tes_shader.cso) {
+            key->part.gs.es = sctx->tes_shader.cso;
+         } else {
+            si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, key, &key->part.gs.vs_prolog);
+            key->part.gs.es = sctx->vs_shader.cso;
+            key->part.gs.prolog.gfx9_prev_is_vs = 1;
+         }
+
+         key->as_ngg = stages_key.u.ngg;
+
+         /* Merged ES-GS can have unbalanced wave usage.
+          *
+          * ES threads are per-vertex, while GS threads are
+          * per-primitive. So without any amplification, there
+          * are fewer GS threads than ES threads, which can result
+          * in empty (no-op) GS waves. With too much amplification,
+          * there are more GS threads than ES threads, which
+          * can result in empty (no-op) ES waves.
+          *
+          * Non-monolithic shaders are implemented by setting EXEC
+          * at the beginning of shader parts, and don't jump to
+          * the end if EXEC is 0.
+          *
+          * Monolithic shaders use conditional blocks, so they can
+          * jump and skip empty waves of ES or GS. So set this to
+          * always use optimized variants, which are monolithic.
+          */
+         key->opt.prefer_mono = 1;
+      }
+      key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix;
+      break;
+   case PIPE_SHADER_FRAGMENT: {
+      struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+      struct si_state_blend *blend = sctx->queued.named.blend;
+
+      if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
+          sel->info.colors_written == 0x1)
+         key->part.ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1;
+
+      /* Select the shader color format based on whether
+       * blending or alpha are needed.
+       */
+      key->part.ps.epilog.spi_shader_col_format =
+         (blend->blend_enable_4bit & blend->need_src_alpha_4bit &
+          sctx->framebuffer.spi_shader_col_format_blend_alpha) |
+         (blend->blend_enable_4bit & ~blend->need_src_alpha_4bit &
+          sctx->framebuffer.spi_shader_col_format_blend) |
+         (~blend->blend_enable_4bit & blend->need_src_alpha_4bit &
+          sctx->framebuffer.spi_shader_col_format_alpha) |
+         (~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit &
+          sctx->framebuffer.spi_shader_col_format);
+      key->part.ps.epilog.spi_shader_col_format &= blend->cb_target_enabled_4bit;
+
+      /* The output for dual source blending should have
+       * the same format as the first output.
+       */
+      if (blend->dual_src_blend) {
+         key->part.ps.epilog.spi_shader_col_format |=
+            (key->part.ps.epilog.spi_shader_col_format & 0xf) << 4;
+      }
+
+      /* If alpha-to-coverage is enabled, we have to export alpha
+       * even if there is no color buffer.
+       */
+      if (!(key->part.ps.epilog.spi_shader_col_format & 0xf) && blend->alpha_to_coverage)
+         key->part.ps.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR;
+
+      /* On GFX6 and GFX7 except Hawaii, the CB doesn't clamp outputs
+       * to the range supported by the type if a channel has less
+       * than 16 bits and the export format is 16_ABGR.
+       */
+      if (sctx->chip_class <= GFX7 && sctx->family != CHIP_HAWAII) {
+         key->part.ps.epilog.color_is_int8 = sctx->framebuffer.color_is_int8;
+         key->part.ps.epilog.color_is_int10 = sctx->framebuffer.color_is_int10;
+      }
+
+      /* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */
+      if (!key->part.ps.epilog.last_cbuf) {
+         key->part.ps.epilog.spi_shader_col_format &= sel->colors_written_4bit;
+         key->part.ps.epilog.color_is_int8 &= sel->info.colors_written;
+         key->part.ps.epilog.color_is_int10 &= sel->info.colors_written;
+      }
+
+      bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim);
+      bool is_line = util_prim_is_lines(sctx->current_rast_prim);
+
+      key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read;
+      key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.colors_read;
+
+      key->part.ps.epilog.alpha_to_one = blend->alpha_to_one && rs->multisample_enable;
+
+      key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly;
+      key->part.ps.epilog.poly_line_smoothing =
+         ((is_poly && rs->poly_smooth) || (is_line && rs->line_smooth)) &&
+         sctx->framebuffer.nr_samples <= 1;
+      key->part.ps.epilog.clamp_color = rs->clamp_fragment_color;
+
+      if (sctx->ps_iter_samples > 1 && sel->info.reads_samplemask) {
+         key->part.ps.prolog.samplemask_log_ps_iter = util_logbase2(sctx->ps_iter_samples);
+      }
+
+      if (rs->force_persample_interp && rs->multisample_enable &&
+          sctx->framebuffer.nr_samples > 1 && sctx->ps_iter_samples > 1) {
+         key->part.ps.prolog.force_persp_sample_interp =
+            sel->info.uses_persp_center || sel->info.uses_persp_centroid;
+
+         key->part.ps.prolog.force_linear_sample_interp =
+            sel->info.uses_linear_center || sel->info.uses_linear_centroid;
+      } else if (rs->multisample_enable && sctx->framebuffer.nr_samples > 1) {
+         key->part.ps.prolog.bc_optimize_for_persp =
+            sel->info.uses_persp_center && sel->info.uses_persp_centroid;
+         key->part.ps.prolog.bc_optimize_for_linear =
+            sel->info.uses_linear_center && sel->info.uses_linear_centroid;
+      } else {
+         /* Make sure SPI doesn't compute more than 1 pair
+          * of (i,j), which is the optimization here. */
+         key->part.ps.prolog.force_persp_center_interp = sel->info.uses_persp_center +
+                                                            sel->info.uses_persp_centroid +
+                                                            sel->info.uses_persp_sample >
+                                                         1;
+
+         key->part.ps.prolog.force_linear_center_interp = sel->info.uses_linear_center +
+                                                             sel->info.uses_linear_centroid +
+                                                             sel->info.uses_linear_sample >
+                                                          1;
+
+         if (sel->info.uses_persp_opcode_interp_sample ||
+             sel->info.uses_linear_opcode_interp_sample)
+            key->mono.u.ps.interpolate_at_sample_force_center = 1;
+      }
+
+      key->part.ps.epilog.alpha_func = si_get_alpha_test_func(sctx);
+
+      /* ps_uses_fbfetch is true only if the color buffer is bound. */
+      if (sctx->ps_uses_fbfetch && !sctx->blitter->running) {
+         struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0];
+         struct pipe_resource *tex = cb0->texture;
+
+         /* 1D textures are allocated and used as 2D on GFX9. */
+         key->mono.u.ps.fbfetch_msaa = sctx->framebuffer.nr_samples > 1;
+         key->mono.u.ps.fbfetch_is_1D =
+            sctx->chip_class != GFX9 &&
+            (tex->target == PIPE_TEXTURE_1D || tex->target == PIPE_TEXTURE_1D_ARRAY);
+         key->mono.u.ps.fbfetch_layered =
+            tex->target == PIPE_TEXTURE_1D_ARRAY || tex->target == PIPE_TEXTURE_2D_ARRAY ||
+            tex->target == PIPE_TEXTURE_CUBE || tex->target == PIPE_TEXTURE_CUBE_ARRAY ||
+            tex->target == PIPE_TEXTURE_3D;
+      }
+      break;
+   }
+   default:
+      assert(0);
+   }
+
+   if (unlikely(sctx->screen->debug_flags & DBG(NO_OPT_VARIANT)))
+      memset(&key->opt, 0, sizeof(key->opt));
+}
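
In the PIPE_SHADER_FRAGMENT case above, spi_shader_col_format is chosen by a bitwise mux over four precomputed framebuffer formats, keyed on blend enable and source-alpha need, with one 4-bit lane per color buffer. The toy program below models just that select; every constant in it is invented for the example.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
   /* One 4-bit lane per color buffer; two buffers in this example. */
   uint32_t blend_enable_4bit   = 0x0F; /* buffer 0 blends, buffer 1 does not */
   uint32_t need_src_alpha_4bit = 0xF0; /* buffer 1 needs source alpha */

   /* Four candidate format words, one nibble per buffer (made-up values). */
   uint32_t fmt_blend_alpha = 0x11, fmt_blend = 0x22, fmt_alpha = 0x33, fmt_plain = 0x44;

   uint32_t col_format =
      (blend_enable_4bit & need_src_alpha_4bit & fmt_blend_alpha) |
      (blend_enable_4bit & ~need_src_alpha_4bit & fmt_blend) |
      (~blend_enable_4bit & need_src_alpha_4bit & fmt_alpha) |
      (~blend_enable_4bit & ~need_src_alpha_4bit & fmt_plain);

   /* Buffer 0 picks the "blend" nibble (0x2), buffer 1 the "alpha" nibble (0x30). */
   printf("col_format = 0x%02x\n", (unsigned)col_format);
   return 0;
}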
+
+static void si_build_shader_variant(struct si_shader *shader, int thread_index, bool low_priority)
+{
+   struct si_shader_selector *sel = shader->selector;
+   struct si_screen *sscreen = sel->screen;
+   struct ac_llvm_compiler *compiler;
+   struct pipe_debug_callback *debug = &shader->compiler_ctx_state.debug;
+
+   if (thread_index >= 0) {
+      if (low_priority) {
+         assert(thread_index < ARRAY_SIZE(sscreen->compiler_lowp));
+         compiler = &sscreen->compiler_lowp[thread_index];
+      } else {
+         assert(thread_index < ARRAY_SIZE(sscreen->compiler));
+         compiler = &sscreen->compiler[thread_index];
+      }
+      if (!debug->async)
+         debug = NULL;
+   } else {
+      assert(!low_priority);
+      compiler = shader->compiler_ctx_state.compiler;
+   }
+
+   if (!compiler->passes)
+      si_init_compiler(sscreen, compiler);
+
+   if (unlikely(!si_create_shader_variant(sscreen, compiler, shader, debug))) {
+      PRINT_ERR("Failed to build shader variant (type=%u)\n", sel->type);
+      shader->compilation_failed = true;
+      return;
+   }
+
+   if (shader->compiler_ctx_state.is_debug_context) {
+      FILE *f = open_memstream(&shader->shader_log, &shader->shader_log_size);
+      if (f) {
+         si_shader_dump(sscreen, shader, NULL, f, false);
+         fclose(f);
+      }
+   }
+
+   si_shader_init_pm4_state(sscreen, shader);
 }
 
 static void si_build_shader_variant_low_priority(void *job, int thread_index)
 {
-       struct si_shader *shader = (struct si_shader *)job;
+   struct si_shader *shader = (struct si_shader *)job;
 
-       assert(thread_index >= 0);
+   assert(thread_index >= 0);
 
-       si_build_shader_variant(shader, thread_index, true);
+   si_build_shader_variant(shader, thread_index, true);
 }
 
 static const struct si_shader_key zeroed;
 
-static bool si_check_missing_main_part(struct si_screen *sscreen,
-                                      struct si_shader_selector *sel,
-                                      struct si_compiler_ctx_state *compiler_state,
-                                      struct si_shader_key *key)
+static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shader_selector *sel,
+                                       struct si_compiler_ctx_state *compiler_state,
+                                       struct si_shader_key *key)
 {
-       struct si_shader **mainp = si_get_main_shader_part(sel, key);
-
-       if (!*mainp) {
-               struct si_shader *main_part = CALLOC_STRUCT(si_shader);
-
-               if (!main_part)
-                       return false;
-
-               /* We can leave the fence as permanently signaled because the
-                * main part becomes visible globally only after it has been
-                * compiled. */
-               util_queue_fence_init(&main_part->ready);
-
-               main_part->selector = sel;
-               main_part->key.as_es = key->as_es;
-               main_part->key.as_ls = key->as_ls;
-               main_part->key.as_ngg = key->as_ngg;
-               main_part->is_monolithic = false;
-
-               if (!si_compile_shader(sscreen, compiler_state->compiler,
-                                      main_part, &compiler_state->debug)) {
-                       FREE(main_part);
-                       return false;
-               }
-               *mainp = main_part;
-       }
-       return true;
+   struct si_shader **mainp = si_get_main_shader_part(sel, key);
+
+   if (!*mainp) {
+      struct si_shader *main_part = CALLOC_STRUCT(si_shader);
+
+      if (!main_part)
+         return false;
+
+      /* We can leave the fence as permanently signaled because the
+       * main part becomes visible globally only after it has been
+       * compiled. */
+      util_queue_fence_init(&main_part->ready);
+
+      main_part->selector = sel;
+      main_part->key.as_es = key->as_es;
+      main_part->key.as_ls = key->as_ls;
+      main_part->key.as_ngg = key->as_ngg;
+      main_part->is_monolithic = false;
+
+      if (!si_compile_shader(sscreen, compiler_state->compiler, main_part,
+                             &compiler_state->debug)) {
+         FREE(main_part);
+         return false;
+      }
+      *mainp = main_part;
+   }
+   return true;
 }
 
 /**
@@ -2277,283 +2106,264 @@ static bool si_check_missing_main_part(struct si_screen *sscreen,
  *                           the compilation isn't finished, don't select any
  *                           shader and return an error.
  */
-int si_shader_select_with_key(struct si_screen *sscreen,
-                             struct si_shader_ctx_state *state,
-                             struct si_compiler_ctx_state *compiler_state,
-                             struct si_shader_key *key,
-                             int thread_index,
-                             bool optimized_or_none)
+int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_state *state,
+                              struct si_compiler_ctx_state *compiler_state,
+                              struct si_shader_key *key, int thread_index, bool optimized_or_none)
 {
-       struct si_shader_selector *sel = state->cso;
-       struct si_shader_selector *previous_stage_sel = NULL;
-       struct si_shader *current = state->current;
-       struct si_shader *iter, *shader = NULL;
+   struct si_shader_selector *sel = state->cso;
+   struct si_shader_selector *previous_stage_sel = NULL;
+   struct si_shader *current = state->current;
+   struct si_shader *iter, *shader = NULL;
 
 again:
-       /* Check if we don't need to change anything.
-        * This path is also used for most shaders that don't need multiple
-        * variants, it will cost just a computation of the key and this
-        * test. */
-       if (likely(current &&
-                  memcmp(&current->key, key, sizeof(*key)) == 0)) {
-               if (unlikely(!util_queue_fence_is_signalled(&current->ready))) {
-                       if (current->is_optimized) {
-                               if (optimized_or_none)
-                                       return -1;
-
-                               memset(&key->opt, 0, sizeof(key->opt));
-                               goto current_not_ready;
-                       }
-
-                       util_queue_fence_wait(&current->ready);
-               }
-
-               return current->compilation_failed ? -1 : 0;
-       }
+   /* Check if we don't need to change anything.
+    * This path is also used for most shaders that don't need multiple
+    * variants, it will cost just a computation of the key and this
+    * test. */
+   if (likely(current && memcmp(&current->key, key, sizeof(*key)) == 0)) {
+      if (unlikely(!util_queue_fence_is_signalled(&current->ready))) {
+         if (current->is_optimized) {
+            if (optimized_or_none)
+               return -1;
+
+            memset(&key->opt, 0, sizeof(key->opt));
+            goto current_not_ready;
+         }
+
+         util_queue_fence_wait(&current->ready);
+      }
+
+      return current->compilation_failed ? -1 : 0;
+   }
 current_not_ready:
 
-       /* This must be done before the mutex is locked, because async GS
-        * compilation calls this function too, and therefore must enter
-        * the mutex first.
-        *
-        * Only wait if we are in a draw call. Don't wait if we are
-        * in a compiler thread.
-        */
-       if (thread_index < 0)
-               util_queue_fence_wait(&sel->ready);
-
-       simple_mtx_lock(&sel->mutex);
-
-       /* Find the shader variant. */
-       for (iter = sel->first_variant; iter; iter = iter->next_variant) {
-               /* Don't check the "current" shader. We checked it above. */
-               if (current != iter &&
-                   memcmp(&iter->key, key, sizeof(*key)) == 0) {
-                       simple_mtx_unlock(&sel->mutex);
-
-                       if (unlikely(!util_queue_fence_is_signalled(&iter->ready))) {
-                               /* If it's an optimized shader and its compilation has
-                                * been started but isn't done, use the unoptimized
-                                * shader so as not to cause a stall due to compilation.
-                                */
-                               if (iter->is_optimized) {
-                                       if (optimized_or_none)
-                                               return -1;
-                                       memset(&key->opt, 0, sizeof(key->opt));
-                                       goto again;
-                               }
-
-                               util_queue_fence_wait(&iter->ready);
-                       }
-
-                       if (iter->compilation_failed) {
-                               return -1; /* skip the draw call */
-                       }
-
-                       state->current = iter;
-                       return 0;
-               }
-       }
-
-       /* Build a new shader. */
-       shader = CALLOC_STRUCT(si_shader);
-       if (!shader) {
-               simple_mtx_unlock(&sel->mutex);
-               return -ENOMEM;
-       }
-
-       util_queue_fence_init(&shader->ready);
-
-       shader->selector = sel;
-       shader->key = *key;
-       shader->compiler_ctx_state = *compiler_state;
-
-       /* If this is a merged shader, get the first shader's selector. */
-       if (sscreen->info.chip_class >= GFX9) {
-               if (sel->type == PIPE_SHADER_TESS_CTRL)
-                       previous_stage_sel = key->part.tcs.ls;
-               else if (sel->type == PIPE_SHADER_GEOMETRY)
-                       previous_stage_sel = key->part.gs.es;
-
-               /* We need to wait for the previous shader. */
-               if (previous_stage_sel && thread_index < 0)
-                       util_queue_fence_wait(&previous_stage_sel->ready);
-       }
-
-       bool is_pure_monolithic =
-               sscreen->use_monolithic_shaders ||
-               memcmp(&key->mono, &zeroed.mono, sizeof(key->mono)) != 0;
-
-       /* Compile the main shader part if it doesn't exist. This can happen
-        * if the initial guess was wrong.
-        *
-        * The prim discard CS doesn't need the main shader part.
-        */
-       if (!is_pure_monolithic &&
-           !key->opt.vs_as_prim_discard_cs) {
-               bool ok = true;
-
-               /* Make sure the main shader part is present. This is needed
-                * for shaders that can be compiled as VS, LS, or ES, and only
-                * one of them is compiled at creation.
-                *
-                * It is also needed for GS, which can be compiled as non-NGG
-                * and NGG.
-                *
-                * For merged shaders, check that the starting shader's main
-                * part is present.
-                */
-               if (previous_stage_sel) {
-                       struct si_shader_key shader1_key = zeroed;
-
-                       if (sel->type == PIPE_SHADER_TESS_CTRL) {
-                               shader1_key.as_ls = 1;
-                       } else if (sel->type == PIPE_SHADER_GEOMETRY) {
-                               shader1_key.as_es = 1;
-                               shader1_key.as_ngg = key->as_ngg; /* for Wave32 vs Wave64 */
-                       } else {
-                               assert(0);
-                       }
-
-                       simple_mtx_lock(&previous_stage_sel->mutex);
-                       ok = si_check_missing_main_part(sscreen,
-                                                       previous_stage_sel,
-                                                       compiler_state, &shader1_key);
-                       simple_mtx_unlock(&previous_stage_sel->mutex);
-               }
-
-               if (ok) {
-                       ok = si_check_missing_main_part(sscreen, sel,
-                                                       compiler_state, key);
-               }
-
-               if (!ok) {
-                       FREE(shader);
-                       simple_mtx_unlock(&sel->mutex);
-                       return -ENOMEM; /* skip the draw call */
-               }
-       }
-
-       /* Keep the reference to the 1st shader of merged shaders, so that
-        * Gallium can't destroy it before we destroy the 2nd shader.
-        *
-        * Set sctx = NULL, because it's unused if we're not releasing
-        * the shader, and we don't have any sctx here.
-        */
-       si_shader_selector_reference(NULL, &shader->previous_stage_sel,
-                                    previous_stage_sel);
-
-       /* Monolithic-only shaders don't make a distinction between optimized
-        * and unoptimized. */
-       shader->is_monolithic =
-               is_pure_monolithic ||
-               memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
-
-       /* The prim discard CS is always optimized. */
-       shader->is_optimized =
-               (!is_pure_monolithic || key->opt.vs_as_prim_discard_cs) &&
-                memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
-
-       /* If it's an optimized shader, compile it asynchronously. */
-       if (shader->is_optimized && thread_index < 0) {
-               /* Compile it asynchronously. */
-               util_queue_add_job(&sscreen->shader_compiler_queue_low_priority,
-                                  shader, &shader->ready,
-                                  si_build_shader_variant_low_priority, NULL,
-                                  0);
-
-               /* Add only after the ready fence was reset, to guard against a
-                * race with si_bind_XX_shader. */
-               if (!sel->last_variant) {
-                       sel->first_variant = shader;
-                       sel->last_variant = shader;
-               } else {
-                       sel->last_variant->next_variant = shader;
-                       sel->last_variant = shader;
-               }
-
-               /* Use the default (unoptimized) shader for now. */
-               memset(&key->opt, 0, sizeof(key->opt));
-               simple_mtx_unlock(&sel->mutex);
-
-               if (sscreen->options.sync_compile)
-                       util_queue_fence_wait(&shader->ready);
-
-               if (optimized_or_none)
-                       return -1;
-               goto again;
-       }
-
-       /* Reset the fence before adding to the variant list. */
-       util_queue_fence_reset(&shader->ready);
-
-       if (!sel->last_variant) {
-               sel->first_variant = shader;
-               sel->last_variant = shader;
-       } else {
-               sel->last_variant->next_variant = shader;
-               sel->last_variant = shader;
-       }
-
-       simple_mtx_unlock(&sel->mutex);
-
-       assert(!shader->is_optimized);
-       si_build_shader_variant(shader, thread_index, false);
-
-       util_queue_fence_signal(&shader->ready);
-
-       if (!shader->compilation_failed)
-               state->current = shader;
-
-       return shader->compilation_failed ? -1 : 0;
-}
-
-static int si_shader_select(struct pipe_context *ctx,
-                           struct si_shader_ctx_state *state,
-                           union si_vgt_stages_key stages_key,
-                           struct si_compiler_ctx_state *compiler_state)
-{
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_shader_key key;
-
-       si_shader_selector_key(ctx, state->cso, stages_key, &key);
-       return si_shader_select_with_key(sctx->screen, state, compiler_state,
-                                        &key, -1, false);
-}
-
-static void si_parse_next_shader_property(const struct si_shader_info *info,
-                                         bool streamout,
-                                         struct si_shader_key *key)
-{
-       unsigned next_shader = info->properties[TGSI_PROPERTY_NEXT_SHADER];
-
-       switch (info->processor) {
-       case PIPE_SHADER_VERTEX:
-               switch (next_shader) {
-               case PIPE_SHADER_GEOMETRY:
-                       key->as_es = 1;
-                       break;
-               case PIPE_SHADER_TESS_CTRL:
-               case PIPE_SHADER_TESS_EVAL:
-                       key->as_ls = 1;
-                       break;
-               default:
-                       /* If POSITION isn't written, it can only be a HW VS
-                        * if streamout is used. If streamout isn't used,
-                        * assume that it's a HW LS. (the next shader is TCS)
-                        * This heuristic is needed for separate shader objects.
-                        */
-                       if (!info->writes_position && !streamout)
-                               key->as_ls = 1;
-               }
-               break;
-
-       case PIPE_SHADER_TESS_EVAL:
-               if (next_shader == PIPE_SHADER_GEOMETRY ||
-                   !info->writes_position)
-                       key->as_es = 1;
-               break;
-       }
+   /* This must be done before the mutex is locked, because async GS
+    * compilation calls this function too, and therefore must enter
+    * the mutex first.
+    *
+    * Only wait if we are in a draw call. Don't wait if we are
+    * in a compiler thread.
+    */
+   if (thread_index < 0)
+      util_queue_fence_wait(&sel->ready);
+
+   simple_mtx_lock(&sel->mutex);
+
+   /* Find the shader variant. */
+   for (iter = sel->first_variant; iter; iter = iter->next_variant) {
+      /* Don't check the "current" shader. We checked it above. */
+      if (current != iter && memcmp(&iter->key, key, sizeof(*key)) == 0) {
+         simple_mtx_unlock(&sel->mutex);
+
+         if (unlikely(!util_queue_fence_is_signalled(&iter->ready))) {
+            /* If it's an optimized shader and its compilation has
+             * been started but isn't done, use the unoptimized
+             * shader so as not to cause a stall due to compilation.
+             */
+            if (iter->is_optimized) {
+               if (optimized_or_none)
+                  return -1;
+               memset(&key->opt, 0, sizeof(key->opt));
+               goto again;
+            }
+
+            util_queue_fence_wait(&iter->ready);
+         }
+
+         if (iter->compilation_failed) {
+            return -1; /* skip the draw call */
+         }
+
+         state->current = iter;
+         return 0;
+      }
+   }
+
+   /* Build a new shader. */
+   shader = CALLOC_STRUCT(si_shader);
+   if (!shader) {
+      simple_mtx_unlock(&sel->mutex);
+      return -ENOMEM;
+   }
+
+   util_queue_fence_init(&shader->ready);
+
+   shader->selector = sel;
+   shader->key = *key;
+   shader->compiler_ctx_state = *compiler_state;
+
+   /* If this is a merged shader, get the first shader's selector. */
+   if (sscreen->info.chip_class >= GFX9) {
+      if (sel->type == PIPE_SHADER_TESS_CTRL)
+         previous_stage_sel = key->part.tcs.ls;
+      else if (sel->type == PIPE_SHADER_GEOMETRY)
+         previous_stage_sel = key->part.gs.es;
+
+      /* We need to wait for the previous shader. */
+      if (previous_stage_sel && thread_index < 0)
+         util_queue_fence_wait(&previous_stage_sel->ready);
+   }
+
+   bool is_pure_monolithic =
+      sscreen->use_monolithic_shaders || memcmp(&key->mono, &zeroed.mono, sizeof(key->mono)) != 0;
+
+   /* Compile the main shader part if it doesn't exist. This can happen
+    * if the initial guess was wrong.
+    *
+    * The prim discard CS doesn't need the main shader part.
+    */
+   if (!is_pure_monolithic && !key->opt.vs_as_prim_discard_cs) {
+      bool ok = true;
+
+      /* Make sure the main shader part is present. This is needed
+       * for shaders that can be compiled as VS, LS, or ES, and only
+       * one of them is compiled at creation.
+       *
+       * It is also needed for GS, which can be compiled as non-NGG
+       * and NGG.
+       *
+       * For merged shaders, check that the starting shader's main
+       * part is present.
+       */
+      if (previous_stage_sel) {
+         struct si_shader_key shader1_key = zeroed;
+
+         if (sel->type == PIPE_SHADER_TESS_CTRL) {
+            shader1_key.as_ls = 1;
+         } else if (sel->type == PIPE_SHADER_GEOMETRY) {
+            shader1_key.as_es = 1;
+            shader1_key.as_ngg = key->as_ngg; /* for Wave32 vs Wave64 */
+         } else {
+            assert(0);
+         }
+
+         simple_mtx_lock(&previous_stage_sel->mutex);
+         ok = si_check_missing_main_part(sscreen, previous_stage_sel, compiler_state, &shader1_key);
+         simple_mtx_unlock(&previous_stage_sel->mutex);
+      }
+
+      if (ok) {
+         ok = si_check_missing_main_part(sscreen, sel, compiler_state, key);
+      }
+
+      if (!ok) {
+         FREE(shader);
+         simple_mtx_unlock(&sel->mutex);
+         return -ENOMEM; /* skip the draw call */
+      }
+   }
+
+   /* Keep the reference to the 1st shader of merged shaders, so that
+    * Gallium can't destroy it before we destroy the 2nd shader.
+    *
+    * Set sctx = NULL, because it's unused if we're not releasing
+    * the shader, and we don't have any sctx here.
+    */
+   si_shader_selector_reference(NULL, &shader->previous_stage_sel, previous_stage_sel);
+
+   /* Monolithic-only shaders don't make a distinction between optimized
+    * and unoptimized. */
+   shader->is_monolithic =
+      is_pure_monolithic || memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
+
+   /* The prim discard CS is always optimized. */
+   shader->is_optimized = (!is_pure_monolithic || key->opt.vs_as_prim_discard_cs) &&
+                          memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
+
+   /* If it's an optimized shader, compile it asynchronously. */
+   if (shader->is_optimized && thread_index < 0) {
+      /* Compile it asynchronously. */
+      util_queue_add_job(&sscreen->shader_compiler_queue_low_priority, shader, &shader->ready,
+                         si_build_shader_variant_low_priority, NULL, 0);
+
+      /* Add only after the ready fence was reset, to guard against a
+       * race with si_bind_XX_shader. */
+      if (!sel->last_variant) {
+         sel->first_variant = shader;
+         sel->last_variant = shader;
+      } else {
+         sel->last_variant->next_variant = shader;
+         sel->last_variant = shader;
+      }
+
+      /* Use the default (unoptimized) shader for now. */
+      memset(&key->opt, 0, sizeof(key->opt));
+      simple_mtx_unlock(&sel->mutex);
+
+      if (sscreen->options.sync_compile)
+         util_queue_fence_wait(&shader->ready);
+
+      if (optimized_or_none)
+         return -1;
+      goto again;
+   }
+
+   /* Reset the fence before adding to the variant list. */
+   util_queue_fence_reset(&shader->ready);
+
+   if (!sel->last_variant) {
+      sel->first_variant = shader;
+      sel->last_variant = shader;
+   } else {
+      sel->last_variant->next_variant = shader;
+      sel->last_variant = shader;
+   }
+
+   simple_mtx_unlock(&sel->mutex);
+
+   assert(!shader->is_optimized);
+   si_build_shader_variant(shader, thread_index, false);
+
+   util_queue_fence_signal(&shader->ready);
+
+   if (!shader->compilation_failed)
+      state->current = shader;
+
+   return shader->compilation_failed ? -1 : 0;
+}
+
+static int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state,
+                            union si_vgt_stages_key stages_key,
+                            struct si_compiler_ctx_state *compiler_state)
+{
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_shader_key key;
+
+   si_shader_selector_key(ctx, state->cso, stages_key, &key);
+   return si_shader_select_with_key(sctx->screen, state, compiler_state, &key, -1, false);
+}
+
+static void si_parse_next_shader_property(const struct si_shader_info *info, bool streamout,
+                                          struct si_shader_key *key)
+{
+   unsigned next_shader = info->properties[TGSI_PROPERTY_NEXT_SHADER];
+
+   switch (info->processor) {
+   case PIPE_SHADER_VERTEX:
+      switch (next_shader) {
+      case PIPE_SHADER_GEOMETRY:
+         key->as_es = 1;
+         break;
+      case PIPE_SHADER_TESS_CTRL:
+      case PIPE_SHADER_TESS_EVAL:
+         key->as_ls = 1;
+         break;
+      default:
+         /* If POSITION isn't written, it can only be a HW VS
+          * if streamout is used. If streamout isn't used,
+          * assume that it's a HW LS. (the next shader is TCS)
+          * This heuristic is needed for separate shader objects.
+          */
+         if (!info->writes_position && !streamout)
+            key->as_ls = 1;
+      }
+      break;
+
+   case PIPE_SHADER_TESS_EVAL:
+      if (next_shader == PIPE_SHADER_GEOMETRY || !info->writes_position)
+         key->as_es = 1;
+      break;
+   }
 }
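
The hunk above reformats si_shader_select_with_key(), whose comments describe a two-level
lookup: try the variant used by the previous draw without taking a lock, then scan the
selector's variant list under sel->mutex and build a new variant on a miss. The stand-alone
sketch below only distills that pattern; every toy_* type and helper in it is invented for
illustration and is not radeonsi code.

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct toy_key {
   unsigned flags;
};

struct toy_variant {
   struct toy_key key;
   struct toy_variant *next;
};

struct toy_selector {
   pthread_mutex_t mutex;
   struct toy_variant *first, *last;
};

/* Return the variant matching "key", creating and appending it on a miss. */
static struct toy_variant *toy_select(struct toy_selector *sel, struct toy_variant *current,
                                      const struct toy_key *key)
{
   /* Fast path: the variant used by the last draw, checked without locking. */
   if (current && memcmp(&current->key, key, sizeof(*key)) == 0)
      return current;

   pthread_mutex_lock(&sel->mutex);

   /* Slow path: scan the selector's list of already-compiled variants. */
   for (struct toy_variant *iter = sel->first; iter; iter = iter->next) {
      if (memcmp(&iter->key, key, sizeof(*key)) == 0) {
         pthread_mutex_unlock(&sel->mutex);
         return iter;
      }
   }

   /* Miss: build a new variant and append it while still holding the lock. */
   struct toy_variant *variant = calloc(1, sizeof(*variant));
   if (variant) {
      variant->key = *key;
      if (sel->last)
         sel->last->next = variant;
      else
         sel->first = variant;
      sel->last = variant;
   }
   pthread_mutex_unlock(&sel->mutex);
   return variant;
}

The real function additionally hands optimized variants to the low-priority compiler queue
and falls back to the unoptimized key while they compile; the sketch omits that part.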
 
 /**
@@ -2563,971 +2373,904 @@ static void si_parse_next_shader_property(const struct si_shader_info *info,
  */
 static void si_init_shader_selector_async(void *job, int thread_index)
 {
-       struct si_shader_selector *sel = (struct si_shader_selector *)job;
-       struct si_screen *sscreen = sel->screen;
-       struct ac_llvm_compiler *compiler;
-       struct pipe_debug_callback *debug = &sel->compiler_ctx_state.debug;
-
-       assert(!debug->debug_message || debug->async);
-       assert(thread_index >= 0);
-       assert(thread_index < ARRAY_SIZE(sscreen->compiler));
-       compiler = &sscreen->compiler[thread_index];
-
-       if (!compiler->passes)
-               si_init_compiler(sscreen, compiler);
-
-       /* Serialize NIR to save memory. Monolithic shader variants
-        * have to deserialize NIR before compilation.
-        */
-       if (sel->nir) {
-               struct blob blob;
-                size_t size;
-
-               blob_init(&blob);
-               /* true = remove optional debugging data to increase
-                * the likehood of getting more shader cache hits.
-                * It also drops variable names, so we'll save more memory.
-                */
-               nir_serialize(&blob, sel->nir, true);
-               blob_finish_get_buffer(&blob, &sel->nir_binary, &size);
-               sel->nir_size = size;
-       }
-
-       /* Compile the main shader part for use with a prolog and/or epilog.
-        * If this fails, the driver will try to compile a monolithic shader
-        * on demand.
-        */
-       if (!sscreen->use_monolithic_shaders) {
-               struct si_shader *shader = CALLOC_STRUCT(si_shader);
-               unsigned char ir_sha1_cache_key[20];
-
-               if (!shader) {
-                       fprintf(stderr, "radeonsi: can't allocate a main shader part\n");
-                       return;
-               }
-
-               /* We can leave the fence signaled because use of the default
-                * main part is guarded by the selector's ready fence. */
-               util_queue_fence_init(&shader->ready);
-
-               shader->selector = sel;
-               shader->is_monolithic = false;
-               si_parse_next_shader_property(&sel->info,
-                                             sel->so.num_outputs != 0,
-                                             &shader->key);
-
-               if (sscreen->use_ngg &&
-                   (!sel->so.num_outputs || sscreen->use_ngg_streamout) &&
-                   ((sel->type == PIPE_SHADER_VERTEX && !shader->key.as_ls) ||
-                    sel->type == PIPE_SHADER_TESS_EVAL ||
-                    sel->type == PIPE_SHADER_GEOMETRY))
-                       shader->key.as_ngg = 1;
-
-               if (sel->nir) {
-                       si_get_ir_cache_key(sel, shader->key.as_ngg,
-                                           shader->key.as_es, ir_sha1_cache_key);
-               }
-
-               /* Try to load the shader from the shader cache. */
-               simple_mtx_lock(&sscreen->shader_cache_mutex);
-
-               if (si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader)) {
-                       simple_mtx_unlock(&sscreen->shader_cache_mutex);
-                       si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
-               } else {
-                       simple_mtx_unlock(&sscreen->shader_cache_mutex);
-
-                       /* Compile the shader if it hasn't been loaded from the cache. */
-                       if (!si_compile_shader(sscreen, compiler, shader, debug)) {
-                               FREE(shader);
-                               fprintf(stderr, "radeonsi: can't compile a main shader part\n");
-                               return;
-                       }
-
-                       simple_mtx_lock(&sscreen->shader_cache_mutex);
-                       si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key,
-                                                     shader, true);
-                       simple_mtx_unlock(&sscreen->shader_cache_mutex);
-               }
-
-               *si_get_main_shader_part(sel, &shader->key) = shader;
-
-               /* Unset "outputs_written" flags for outputs converted to
-                * DEFAULT_VAL, so that later inter-shader optimizations don't
-                * try to eliminate outputs that don't exist in the final
-                * shader.
-                *
-                * This is only done if non-monolithic shaders are enabled.
-                */
-               if ((sel->type == PIPE_SHADER_VERTEX ||
-                    sel->type == PIPE_SHADER_TESS_EVAL) &&
-                   !shader->key.as_ls &&
-                   !shader->key.as_es) {
-                       unsigned i;
-
-                       for (i = 0; i < sel->info.num_outputs; i++) {
-                               unsigned offset = shader->info.vs_output_param_offset[i];
-
-                               if (offset <= AC_EXP_PARAM_OFFSET_31)
-                                       continue;
-
-                               unsigned name = sel->info.output_semantic_name[i];
-                               unsigned index = sel->info.output_semantic_index[i];
-                               unsigned id;
-
-                               switch (name) {
-                               case TGSI_SEMANTIC_GENERIC:
-                                       /* don't process indices the function can't handle */
-                                       if (index >= SI_MAX_IO_GENERIC)
-                                               break;
-                                       /* fall through */
-                               default:
-                                       id = si_shader_io_get_unique_index(name, index, true);
-                                       sel->outputs_written_before_ps &= ~(1ull << id);
-                                       break;
-                               case TGSI_SEMANTIC_POSITION: /* ignore these */
-                               case TGSI_SEMANTIC_PSIZE:
-                               case TGSI_SEMANTIC_CLIPVERTEX:
-                               case TGSI_SEMANTIC_EDGEFLAG:
-                                       break;
-                               }
-                       }
-               }
-       }
-
-       /* The GS copy shader is always pre-compiled. */
-       if (sel->type == PIPE_SHADER_GEOMETRY &&
-           (!sscreen->use_ngg ||
-            !sscreen->use_ngg_streamout || /* also for PRIMITIVES_GENERATED */
-            sel->tess_turns_off_ngg)) {
-               sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug);
-               if (!sel->gs_copy_shader) {
-                       fprintf(stderr, "radeonsi: can't create GS copy shader\n");
-                       return;
-               }
-
-               si_shader_vs(sscreen, sel->gs_copy_shader, sel);
-       }
-
-       /* Free NIR. We only keep serialized NIR after this point. */
-       if (sel->nir) {
-               ralloc_free(sel->nir);
-               sel->nir = NULL;
-       }
+   struct si_shader_selector *sel = (struct si_shader_selector *)job;
+   struct si_screen *sscreen = sel->screen;
+   struct ac_llvm_compiler *compiler;
+   struct pipe_debug_callback *debug = &sel->compiler_ctx_state.debug;
+
+   assert(!debug->debug_message || debug->async);
+   assert(thread_index >= 0);
+   assert(thread_index < ARRAY_SIZE(sscreen->compiler));
+   compiler = &sscreen->compiler[thread_index];
+
+   if (!compiler->passes)
+      si_init_compiler(sscreen, compiler);
+
+   /* Serialize NIR to save memory. Monolithic shader variants
+    * have to deserialize NIR before compilation.
+    */
+   if (sel->nir) {
+      struct blob blob;
+      size_t size;
+
+      blob_init(&blob);
+      /* true = remove optional debugging data to increase
+       * the likelihood of getting more shader cache hits.
+       * It also drops variable names, so we'll save more memory.
+       */
+      nir_serialize(&blob, sel->nir, true);
+      blob_finish_get_buffer(&blob, &sel->nir_binary, &size);
+      sel->nir_size = size;
+   }
+
+   /* Compile the main shader part for use with a prolog and/or epilog.
+    * If this fails, the driver will try to compile a monolithic shader
+    * on demand.
+    */
+   if (!sscreen->use_monolithic_shaders) {
+      struct si_shader *shader = CALLOC_STRUCT(si_shader);
+      unsigned char ir_sha1_cache_key[20];
+
+      if (!shader) {
+         fprintf(stderr, "radeonsi: can't allocate a main shader part\n");
+         return;
+      }
+
+      /* We can leave the fence signaled because use of the default
+       * main part is guarded by the selector's ready fence. */
+      util_queue_fence_init(&shader->ready);
+
+      shader->selector = sel;
+      shader->is_monolithic = false;
+      si_parse_next_shader_property(&sel->info, sel->so.num_outputs != 0, &shader->key);
+
+      if (sscreen->use_ngg && (!sel->so.num_outputs || sscreen->use_ngg_streamout) &&
+          ((sel->type == PIPE_SHADER_VERTEX && !shader->key.as_ls) ||
+           sel->type == PIPE_SHADER_TESS_EVAL || sel->type == PIPE_SHADER_GEOMETRY))
+         shader->key.as_ngg = 1;
+
+      if (sel->nir) {
+         si_get_ir_cache_key(sel, shader->key.as_ngg, shader->key.as_es, ir_sha1_cache_key);
+      }
+
+      /* Try to load the shader from the shader cache. */
+      simple_mtx_lock(&sscreen->shader_cache_mutex);
+
+      if (si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader)) {
+         simple_mtx_unlock(&sscreen->shader_cache_mutex);
+         si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
+      } else {
+         simple_mtx_unlock(&sscreen->shader_cache_mutex);
+
+         /* Compile the shader if it hasn't been loaded from the cache. */
+         if (!si_compile_shader(sscreen, compiler, shader, debug)) {
+            FREE(shader);
+            fprintf(stderr, "radeonsi: can't compile a main shader part\n");
+            return;
+         }
+
+         simple_mtx_lock(&sscreen->shader_cache_mutex);
+         si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, shader, true);
+         simple_mtx_unlock(&sscreen->shader_cache_mutex);
+      }
+
+      *si_get_main_shader_part(sel, &shader->key) = shader;
+
+      /* Unset "outputs_written" flags for outputs converted to
+       * DEFAULT_VAL, so that later inter-shader optimizations don't
+       * try to eliminate outputs that don't exist in the final
+       * shader.
+       *
+       * This is only done if non-monolithic shaders are enabled.
+       */
+      if ((sel->type == PIPE_SHADER_VERTEX || sel->type == PIPE_SHADER_TESS_EVAL) &&
+          !shader->key.as_ls && !shader->key.as_es) {
+         unsigned i;
+
+         for (i = 0; i < sel->info.num_outputs; i++) {
+            unsigned offset = shader->info.vs_output_param_offset[i];
+
+            if (offset <= AC_EXP_PARAM_OFFSET_31)
+               continue;
+
+            unsigned name = sel->info.output_semantic_name[i];
+            unsigned index = sel->info.output_semantic_index[i];
+            unsigned id;
+
+            switch (name) {
+            case TGSI_SEMANTIC_GENERIC:
+               /* don't process indices the function can't handle */
+               if (index >= SI_MAX_IO_GENERIC)
+                  break;
+               /* fall through */
+            default:
+               id = si_shader_io_get_unique_index(name, index, true);
+               sel->outputs_written_before_ps &= ~(1ull << id);
+               break;
+            case TGSI_SEMANTIC_POSITION: /* ignore these */
+            case TGSI_SEMANTIC_PSIZE:
+            case TGSI_SEMANTIC_CLIPVERTEX:
+            case TGSI_SEMANTIC_EDGEFLAG:
+               break;
+            }
+         }
+      }
+   }
+
+   /* The GS copy shader is always pre-compiled. */
+   if (sel->type == PIPE_SHADER_GEOMETRY &&
+       (!sscreen->use_ngg || !sscreen->use_ngg_streamout || /* also for PRIMITIVES_GENERATED */
+        sel->tess_turns_off_ngg)) {
+      sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug);
+      if (!sel->gs_copy_shader) {
+         fprintf(stderr, "radeonsi: can't create GS copy shader\n");
+         return;
+      }
+
+      si_shader_vs(sscreen, sel->gs_copy_shader, sel);
+   }
+
+   /* Free NIR. We only keep serialized NIR after this point. */
+   if (sel->nir) {
+      ralloc_free(sel->nir);
+      sel->nir = NULL;
+   }
 }
 
 void si_schedule_initial_compile(struct si_context *sctx, unsigned processor,
-                                struct util_queue_fence *ready_fence,
-                                struct si_compiler_ctx_state *compiler_ctx_state,
-                                void *job, util_queue_execute_func execute)
+                                 struct util_queue_fence *ready_fence,
+                                 struct si_compiler_ctx_state *compiler_ctx_state, void *job,
+                                 util_queue_execute_func execute)
 {
-       util_queue_fence_init(ready_fence);
-
-       struct util_async_debug_callback async_debug;
-       bool debug =
-               (sctx->debug.debug_message && !sctx->debug.async) ||
-               sctx->is_debug ||
-               si_can_dump_shader(sctx->screen, processor);
-
-       if (debug) {
-               u_async_debug_init(&async_debug);
-               compiler_ctx_state->debug = async_debug.base;
-       }
-
-       util_queue_add_job(&sctx->screen->shader_compiler_queue, job,
-                          ready_fence, execute, NULL, 0);
-
-       if (debug) {
-               util_queue_fence_wait(ready_fence);
-               u_async_debug_drain(&async_debug, &sctx->debug);
-               u_async_debug_cleanup(&async_debug);
-       }
-
-       if (sctx->screen->options.sync_compile)
-               util_queue_fence_wait(ready_fence);
+   util_queue_fence_init(ready_fence);
+
+   struct util_async_debug_callback async_debug;
+   bool debug = (sctx->debug.debug_message && !sctx->debug.async) || sctx->is_debug ||
+                si_can_dump_shader(sctx->screen, processor);
+
+   if (debug) {
+      u_async_debug_init(&async_debug);
+      compiler_ctx_state->debug = async_debug.base;
+   }
+
+   util_queue_add_job(&sctx->screen->shader_compiler_queue, job, ready_fence, execute, NULL, 0);
+
+   if (debug) {
+      util_queue_fence_wait(ready_fence);
+      u_async_debug_drain(&async_debug, &sctx->debug);
+      u_async_debug_cleanup(&async_debug);
+   }
+
+   if (sctx->screen->options.sync_compile)
+      util_queue_fence_wait(ready_fence);
 }
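
si_schedule_initial_compile() and the shader-variant code above both publish work to other
threads through util_queue_fence_* objects. Purely as a mental model (this is not Mesa's
util_queue implementation), such a fence can be sketched with a mutex and a condition
variable:

#include <pthread.h>
#include <stdbool.h>

struct toy_fence {
   pthread_mutex_t lock;
   pthread_cond_t cond;
   bool signalled;
};

static void toy_fence_init(struct toy_fence *f)
{
   pthread_mutex_init(&f->lock, NULL);
   pthread_cond_init(&f->cond, NULL);
   f->signalled = true; /* like util_queue_fence_init: a fresh fence is signalled */
}

static void toy_fence_reset(struct toy_fence *f)
{
   pthread_mutex_lock(&f->lock);
   f->signalled = false; /* work is now pending */
   pthread_mutex_unlock(&f->lock);
}

static void toy_fence_signal(struct toy_fence *f)
{
   pthread_mutex_lock(&f->lock);
   f->signalled = true;
   pthread_cond_broadcast(&f->cond);
   pthread_mutex_unlock(&f->lock);
}

static void toy_fence_wait(struct toy_fence *f)
{
   pthread_mutex_lock(&f->lock);
   while (!f->signalled)
      pthread_cond_wait(&f->cond, &f->lock);
   pthread_mutex_unlock(&f->lock);
}

Judging from the calls above, util_queue_add_job() takes care of resetting the fence it is
given and the worker signals it once the job has run, which is why si_schedule_initial_compile()
can wait on ready_fence immediately after queuing when debugging or options.sync_compile is
enabled.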
 
 /* Return descriptor slot usage masks from the given shader info. */
-void si_get_active_slot_masks(const struct si_shader_info *info,
-                             uint32_t *const_and_shader_buffers,
-                             uint64_t *samplers_and_images)
-{
-       unsigned start, num_shaderbufs, num_constbufs, num_images, num_msaa_images, num_samplers;
-
-       num_shaderbufs = util_last_bit(info->shader_buffers_declared);
-       num_constbufs = util_last_bit(info->const_buffers_declared);
-       /* two 8-byte images share one 16-byte slot */
-       num_images = align(util_last_bit(info->images_declared), 2);
-       num_msaa_images = align(util_last_bit(info->msaa_images_declared), 2);
-       num_samplers = util_last_bit(info->samplers_declared);
-
-       /* The layout is: sb[last] ... sb[0], cb[0] ... cb[last] */
-       start = si_get_shaderbuf_slot(num_shaderbufs - 1);
-       *const_and_shader_buffers =
-               u_bit_consecutive(start, num_shaderbufs + num_constbufs);
-
-       /* The layout is:
-        *   - fmask[last] ... fmask[0]     go to [15-last .. 15]
-        *   - image[last] ... image[0]     go to [31-last .. 31]
-        *   - sampler[0] ... sampler[last] go to [32 .. 32+last*2]
-        *
-        * FMASKs for images are placed separately, because MSAA images are rare,
-        * and so we can benefit from a better cache hit rate if we keep image
-        * descriptors together.
-        */
-       if (num_msaa_images)
-               num_images = SI_NUM_IMAGES + num_msaa_images; /* add FMASK descriptors */
-
-       start = si_get_image_slot(num_images - 1) / 2;
-       *samplers_and_images =
-               u_bit_consecutive64(start, num_images / 2 + num_samplers);
+void si_get_active_slot_masks(const struct si_shader_info *info, uint32_t *const_and_shader_buffers,
+                              uint64_t *samplers_and_images)
+{
+   unsigned start, num_shaderbufs, num_constbufs, num_images, num_msaa_images, num_samplers;
+
+   num_shaderbufs = util_last_bit(info->shader_buffers_declared);
+   num_constbufs = util_last_bit(info->const_buffers_declared);
+   /* two 8-byte images share one 16-byte slot */
+   num_images = align(util_last_bit(info->images_declared), 2);
+   num_msaa_images = align(util_last_bit(info->msaa_images_declared), 2);
+   num_samplers = util_last_bit(info->samplers_declared);
+
+   /* The layout is: sb[last] ... sb[0], cb[0] ... cb[last] */
+   start = si_get_shaderbuf_slot(num_shaderbufs - 1);
+   *const_and_shader_buffers = u_bit_consecutive(start, num_shaderbufs + num_constbufs);
+
+   /* The layout is:
+    *   - fmask[last] ... fmask[0]     go to [15-last .. 15]
+    *   - image[last] ... image[0]     go to [31-last .. 31]
+    *   - sampler[0] ... sampler[last] go to [32 .. 32+last*2]
+    *
+    * FMASKs for images are placed separately, because MSAA images are rare,
+    * and so we can benefit from a better cache hit rate if we keep image
+    * descriptors together.
+    */
+   if (num_msaa_images)
+      num_images = SI_NUM_IMAGES + num_msaa_images; /* add FMASK descriptors */
+
+   start = si_get_image_slot(num_images - 1) / 2;
+   *samplers_and_images = u_bit_consecutive64(start, num_images / 2 + num_samplers);
 }
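
The slot-layout comments in si_get_active_slot_masks() are easiest to follow with concrete
numbers. The stand-alone fragment below only illustrates the bit arithmetic: NUM_SHADERBUFS
and the slot mapping are invented and do not match radeonsi's real constants, and
bit_consecutive() merely mimics what u_bit_consecutive() returns (bits start .. start+count-1
set).

#include <stdint.h>
#include <stdio.h>

#define NUM_SHADERBUFS 8 /* hypothetical, not radeonsi's constant */

static uint32_t bit_consecutive(unsigned start, unsigned count)
{
   return ((count == 32 ? 0u : 1u << count) - 1u) << start;
}

int main(void)
{
   unsigned num_shaderbufs = 2; /* sb[0], sb[1] declared */
   unsigned num_constbufs = 3;  /* cb[0..2] declared */

   /* Layout: sb[last] ... sb[0], cb[0] ... cb[last], so the run of active
    * slots starts at the slot of the highest-numbered shader buffer. */
   unsigned start = NUM_SHADERBUFS - num_shaderbufs;
   uint32_t mask = bit_consecutive(start, num_shaderbufs + num_constbufs);

   printf("const_and_shader_buffers mask: 0x%08x\n", mask); /* 0x000007c0 */
   return 0;
}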
 
 static void *si_create_shader_selector(struct pipe_context *ctx,
-                                      const struct pipe_shader_state *state)
-{
-       struct si_screen *sscreen = (struct si_screen *)ctx->screen;
-       struct si_context *sctx = (struct si_context*)ctx;
-       struct si_shader_selector *sel = CALLOC_STRUCT(si_shader_selector);
-       int i;
-
-       if (!sel)
-               return NULL;
-
-       sel->screen = sscreen;
-       sel->compiler_ctx_state.debug = sctx->debug;
-       sel->compiler_ctx_state.is_debug_context = sctx->is_debug;
-
-       sel->so = state->stream_output;
-
-       if (state->type == PIPE_SHADER_IR_TGSI) {
-               sel->nir = tgsi_to_nir(state->tokens, ctx->screen);
-       } else {
-               assert(state->type == PIPE_SHADER_IR_NIR);
-               sel->nir = state->ir.nir;
-       }
-
-       si_nir_scan_shader(sel->nir, &sel->info);
-       si_nir_adjust_driver_locations(sel->nir);
-
-       sel->type = sel->info.processor;
-       p_atomic_inc(&sscreen->num_shaders_created);
-       si_get_active_slot_masks(&sel->info,
-                                &sel->active_const_and_shader_buffers,
-                                &sel->active_samplers_and_images);
-
-       /* Record which streamout buffers are enabled. */
-       for (i = 0; i < sel->so.num_outputs; i++) {
-               sel->enabled_streamout_buffer_mask |=
-                       (1 << sel->so.output[i].output_buffer) <<
-                       (sel->so.output[i].stream * 4);
-       }
-
-       sel->num_vs_inputs = sel->type == PIPE_SHADER_VERTEX &&
-                            !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] ?
-                                    sel->info.num_inputs : 0;
-       sel->num_vbos_in_user_sgprs =
-               MIN2(sel->num_vs_inputs, sscreen->num_vbos_in_user_sgprs);
-
-       /* The prolog is a no-op if there are no inputs. */
-       sel->vs_needs_prolog = sel->type == PIPE_SHADER_VERTEX &&
-                              sel->info.num_inputs &&
-                              !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
-
-       sel->prim_discard_cs_allowed =
-               sel->type == PIPE_SHADER_VERTEX &&
-               !sel->info.uses_bindless_images &&
-               !sel->info.uses_bindless_samplers &&
-               !sel->info.writes_memory &&
-               !sel->info.writes_viewport_index &&
-               !sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] &&
-               !sel->so.num_outputs;
-
-       switch (sel->type) {
-       case PIPE_SHADER_GEOMETRY:
-               sel->gs_output_prim =
-                       sel->info.properties[TGSI_PROPERTY_GS_OUTPUT_PRIM];
-
-               /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */
-               sel->rast_prim = sel->gs_output_prim;
-               if (util_rast_prim_is_triangles(sel->rast_prim))
-                       sel->rast_prim = PIPE_PRIM_TRIANGLES;
-
-               sel->gs_max_out_vertices =
-                       sel->info.properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES];
-               sel->gs_num_invocations =
-                       sel->info.properties[TGSI_PROPERTY_GS_INVOCATIONS];
-               sel->gsvs_vertex_size = sel->info.num_outputs * 16;
-               sel->max_gsvs_emit_size = sel->gsvs_vertex_size *
-                                         sel->gs_max_out_vertices;
-
-               sel->max_gs_stream = 0;
-               for (i = 0; i < sel->so.num_outputs; i++)
-                       sel->max_gs_stream = MAX2(sel->max_gs_stream,
-                                                 sel->so.output[i].stream);
-
-               sel->gs_input_verts_per_prim =
-                       u_vertices_per_prim(sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]);
-
-               /* EN_MAX_VERT_OUT_PER_GS_INSTANCE does not work with tesselation. */
-               sel->tess_turns_off_ngg =
-                       sscreen->info.chip_class == GFX10 &&
-                       sel->gs_num_invocations * sel->gs_max_out_vertices > 256;
-               break;
-
-       case PIPE_SHADER_TESS_CTRL:
-               /* Always reserve space for these. */
-               sel->patch_outputs_written |=
-                       (1ull << si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0)) |
-                       (1ull << si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0));
-               /* fall through */
-       case PIPE_SHADER_VERTEX:
-       case PIPE_SHADER_TESS_EVAL:
-               for (i = 0; i < sel->info.num_outputs; i++) {
-                       unsigned name = sel->info.output_semantic_name[i];
-                       unsigned index = sel->info.output_semantic_index[i];
-
-                       switch (name) {
-                       case TGSI_SEMANTIC_TESSINNER:
-                       case TGSI_SEMANTIC_TESSOUTER:
-                       case TGSI_SEMANTIC_PATCH:
-                               sel->patch_outputs_written |=
-                                       1ull << si_shader_io_get_unique_index_patch(name, index);
-                               break;
-
-                       case TGSI_SEMANTIC_GENERIC:
-                               /* don't process indices the function can't handle */
-                               if (index >= SI_MAX_IO_GENERIC)
-                                       break;
-                               /* fall through */
-                       default:
-                               sel->outputs_written |=
-                                       1ull << si_shader_io_get_unique_index(name, index, false);
-                               sel->outputs_written_before_ps |=
-                                       1ull << si_shader_io_get_unique_index(name, index, true);
-                               break;
-                       case TGSI_SEMANTIC_EDGEFLAG:
-                               break;
-                       }
-               }
-               sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16;
-               sel->lshs_vertex_stride = sel->esgs_itemsize;
-
-               /* Add 1 dword to reduce LDS bank conflicts, so that each vertex
-                * will start on a different bank. (except for the maximum 32*16).
-                */
-               if (sel->lshs_vertex_stride < 32*16)
-                       sel->lshs_vertex_stride += 4;
-
-               /* For the ESGS ring in LDS, add 1 dword to reduce LDS bank
-                * conflicts, i.e. each vertex will start at a different bank.
-                */
-               if (sctx->chip_class >= GFX9)
-                       sel->esgs_itemsize += 4;
-
-               assert(((sel->esgs_itemsize / 4) & C_028AAC_ITEMSIZE) == 0);
-
-               /* Only for TES: */
-               if (sel->info.properties[TGSI_PROPERTY_TES_POINT_MODE])
-                       sel->rast_prim = PIPE_PRIM_POINTS;
-               else if (sel->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES)
-                       sel->rast_prim = PIPE_PRIM_LINE_STRIP;
-               else
-                       sel->rast_prim = PIPE_PRIM_TRIANGLES;
-               break;
-
-       case PIPE_SHADER_FRAGMENT:
-               for (i = 0; i < sel->info.num_inputs; i++) {
-                       unsigned name = sel->info.input_semantic_name[i];
-                       unsigned index = sel->info.input_semantic_index[i];
-
-                       switch (name) {
-                       case TGSI_SEMANTIC_GENERIC:
-                               /* don't process indices the function can't handle */
-                               if (index >= SI_MAX_IO_GENERIC)
-                                       break;
-                               /* fall through */
-                       default:
-                               sel->inputs_read |=
-                                       1ull << si_shader_io_get_unique_index(name, index, true);
-                               break;
-                       case TGSI_SEMANTIC_PCOORD: /* ignore this */
-                               break;
-                       }
-               }
-
-               for (i = 0; i < 8; i++)
-                       if (sel->info.colors_written & (1 << i))
-                               sel->colors_written_4bit |= 0xf << (4 * i);
-
-               for (i = 0; i < sel->info.num_inputs; i++) {
-                       if (sel->info.input_semantic_name[i] == TGSI_SEMANTIC_COLOR) {
-                               int index = sel->info.input_semantic_index[i];
-                               sel->color_attr_index[index] = i;
-                       }
-               }
-               break;
-       default:;
-       }
-
-       sel->ngg_culling_allowed =
-               sscreen->info.chip_class == GFX10 &&
-               sscreen->info.has_dedicated_vram &&
-               sscreen->use_ngg_culling &&
-               /* Disallow TES by default, because TessMark results are mixed. */
-               (sel->type == PIPE_SHADER_VERTEX ||
-                (sscreen->always_use_ngg_culling && sel->type == PIPE_SHADER_TESS_EVAL)) &&
-               sel->info.writes_position &&
-               !sel->info.writes_viewport_index && /* cull only against viewport 0 */
-               !sel->info.writes_memory &&
-               !sel->so.num_outputs &&
-               !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] &&
-               !sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
-
-       /* PA_CL_VS_OUT_CNTL */
-       if (sctx->chip_class <= GFX9)
-               sel->pa_cl_vs_out_cntl = si_get_vs_out_cntl(sel, false);
-
-       sel->clipdist_mask = sel->info.writes_clipvertex ?
-                                    SIX_BITS : sel->info.clipdist_writemask;
-       sel->culldist_mask = sel->info.culldist_writemask <<
-                            sel->info.num_written_clipdistance;
-
-       /* DB_SHADER_CONTROL */
-       sel->db_shader_control =
-               S_02880C_Z_EXPORT_ENABLE(sel->info.writes_z) |
-               S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(sel->info.writes_stencil) |
-               S_02880C_MASK_EXPORT_ENABLE(sel->info.writes_samplemask) |
-               S_02880C_KILL_ENABLE(sel->info.uses_kill);
-
-       switch (sel->info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]) {
-       case TGSI_FS_DEPTH_LAYOUT_GREATER:
-               sel->db_shader_control |=
-                       S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z);
-               break;
-       case TGSI_FS_DEPTH_LAYOUT_LESS:
-               sel->db_shader_control |=
-                       S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z);
-               break;
-       }
-
-       /* Z_ORDER, EXEC_ON_HIER_FAIL and EXEC_ON_NOOP should be set as following:
-        *
-        *   | early Z/S | writes_mem | allow_ReZ? |      Z_ORDER       | EXEC_ON_HIER_FAIL | EXEC_ON_NOOP
-        * --|-----------|------------|------------|--------------------|-------------------|-------------
-        * 1a|   false   |   false    |   true     | EarlyZ_Then_ReZ    |         0         |     0
-        * 1b|   false   |   false    |   false    | EarlyZ_Then_LateZ  |         0         |     0
-        * 2 |   false   |   true     |   n/a      |       LateZ        |         1         |     0
-        * 3 |   true    |   false    |   n/a      | EarlyZ_Then_LateZ  |         0         |     0
-        * 4 |   true    |   true     |   n/a      | EarlyZ_Then_LateZ  |         0         |     1
-        *
-        * In cases 3 and 4, HW will force Z_ORDER to EarlyZ regardless of what's set in the register.
-        * In case 2, NOOP_CULL is a don't care field. In case 2, 3 and 4, ReZ doesn't make sense.
-        *
-        * Don't use ReZ without profiling !!!
-        *
-        * ReZ decreases performance by 15% in DiRT: Showdown on Ultra settings, which has pretty complex
-        * shaders.
-        */
-       if (sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]) {
-               /* Cases 3, 4. */
-               sel->db_shader_control |= S_02880C_DEPTH_BEFORE_SHADER(1) |
-                                         S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z) |
-                                         S_02880C_EXEC_ON_NOOP(sel->info.writes_memory);
-       } else if (sel->info.writes_memory) {
-               /* Case 2. */
-               sel->db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z) |
-                                         S_02880C_EXEC_ON_HIER_FAIL(1);
-       } else {
-               /* Case 1. */
-               sel->db_shader_control |= S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z);
-       }
-
-       if (sel->info.properties[TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE])
-               sel->db_shader_control |= S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(1);
-
-       (void) simple_mtx_init(&sel->mutex, mtx_plain);
-
-       si_schedule_initial_compile(sctx, sel->info.processor, &sel->ready,
-                                   &sel->compiler_ctx_state, sel,
-                                   si_init_shader_selector_async);
-       return sel;
-}
-
-static void *si_create_shader(struct pipe_context *ctx,
-                             const struct pipe_shader_state *state)
-{
-       struct si_screen *sscreen = (struct si_screen *)ctx->screen;
-
-       return util_live_shader_cache_get(ctx, &sscreen->live_shader_cache, state);
+                                       const struct pipe_shader_state *state)
+{
+   struct si_screen *sscreen = (struct si_screen *)ctx->screen;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_shader_selector *sel = CALLOC_STRUCT(si_shader_selector);
+   int i;
+
+   if (!sel)
+      return NULL;
+
+   sel->screen = sscreen;
+   sel->compiler_ctx_state.debug = sctx->debug;
+   sel->compiler_ctx_state.is_debug_context = sctx->is_debug;
+
+   sel->so = state->stream_output;
+
+   if (state->type == PIPE_SHADER_IR_TGSI) {
+      sel->nir = tgsi_to_nir(state->tokens, ctx->screen);
+   } else {
+      assert(state->type == PIPE_SHADER_IR_NIR);
+      sel->nir = state->ir.nir;
+   }
+
+   si_nir_scan_shader(sel->nir, &sel->info);
+   si_nir_adjust_driver_locations(sel->nir);
+
+   sel->type = sel->info.processor;
+   p_atomic_inc(&sscreen->num_shaders_created);
+   si_get_active_slot_masks(&sel->info, &sel->active_const_and_shader_buffers,
+                            &sel->active_samplers_and_images);
+
+   /* Record which streamout buffers are enabled. */
+   for (i = 0; i < sel->so.num_outputs; i++) {
+      sel->enabled_streamout_buffer_mask |= (1 << sel->so.output[i].output_buffer)
+                                            << (sel->so.output[i].stream * 4);
+   }
+
+   sel->num_vs_inputs =
+      sel->type == PIPE_SHADER_VERTEX && !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]
+         ? sel->info.num_inputs
+         : 0;
+   sel->num_vbos_in_user_sgprs = MIN2(sel->num_vs_inputs, sscreen->num_vbos_in_user_sgprs);
+
+   /* The prolog is a no-op if there are no inputs. */
+   sel->vs_needs_prolog = sel->type == PIPE_SHADER_VERTEX && sel->info.num_inputs &&
+                          !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
+
+   sel->prim_discard_cs_allowed =
+      sel->type == PIPE_SHADER_VERTEX && !sel->info.uses_bindless_images &&
+      !sel->info.uses_bindless_samplers && !sel->info.writes_memory &&
+      !sel->info.writes_viewport_index &&
+      !sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] && !sel->so.num_outputs;
+
+   switch (sel->type) {
+   case PIPE_SHADER_GEOMETRY:
+      sel->gs_output_prim = sel->info.properties[TGSI_PROPERTY_GS_OUTPUT_PRIM];
+
+      /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */
+      sel->rast_prim = sel->gs_output_prim;
+      if (util_rast_prim_is_triangles(sel->rast_prim))
+         sel->rast_prim = PIPE_PRIM_TRIANGLES;
+
+      sel->gs_max_out_vertices = sel->info.properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES];
+      sel->gs_num_invocations = sel->info.properties[TGSI_PROPERTY_GS_INVOCATIONS];
+      sel->gsvs_vertex_size = sel->info.num_outputs * 16;
+      sel->max_gsvs_emit_size = sel->gsvs_vertex_size * sel->gs_max_out_vertices;
+
+      sel->max_gs_stream = 0;
+      for (i = 0; i < sel->so.num_outputs; i++)
+         sel->max_gs_stream = MAX2(sel->max_gs_stream, sel->so.output[i].stream);
+
+      sel->gs_input_verts_per_prim =
+         u_vertices_per_prim(sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]);
+
+      /* EN_MAX_VERT_OUT_PER_GS_INSTANCE does not work with tessellation. */
+      sel->tess_turns_off_ngg = sscreen->info.chip_class == GFX10 &&
+                                sel->gs_num_invocations * sel->gs_max_out_vertices > 256;
+      break;
+
+   case PIPE_SHADER_TESS_CTRL:
+      /* Always reserve space for these. */
+      sel->patch_outputs_written |=
+         (1ull << si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0)) |
+         (1ull << si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0));
+      /* fall through */
+   case PIPE_SHADER_VERTEX:
+   case PIPE_SHADER_TESS_EVAL:
+      for (i = 0; i < sel->info.num_outputs; i++) {
+         unsigned name = sel->info.output_semantic_name[i];
+         unsigned index = sel->info.output_semantic_index[i];
+
+         switch (name) {
+         case TGSI_SEMANTIC_TESSINNER:
+         case TGSI_SEMANTIC_TESSOUTER:
+         case TGSI_SEMANTIC_PATCH:
+            sel->patch_outputs_written |= 1ull << si_shader_io_get_unique_index_patch(name, index);
+            break;
+
+         case TGSI_SEMANTIC_GENERIC:
+            /* don't process indices the function can't handle */
+            if (index >= SI_MAX_IO_GENERIC)
+               break;
+            /* fall through */
+         default:
+            sel->outputs_written |= 1ull << si_shader_io_get_unique_index(name, index, false);
+            sel->outputs_written_before_ps |= 1ull
+                                              << si_shader_io_get_unique_index(name, index, true);
+            break;
+         case TGSI_SEMANTIC_EDGEFLAG:
+            break;
+         }
+      }
+      sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16;
+      sel->lshs_vertex_stride = sel->esgs_itemsize;
+
+      /* Add 1 dword to reduce LDS bank conflicts, so that each vertex
+       * will start on a different bank. (except for the maximum 32*16).
+       */
+      if (sel->lshs_vertex_stride < 32 * 16)
+         sel->lshs_vertex_stride += 4;
+
+      /* For the ESGS ring in LDS, add 1 dword to reduce LDS bank
+       * conflicts, i.e. each vertex will start at a different bank.
+       */
+      if (sctx->chip_class >= GFX9)
+         sel->esgs_itemsize += 4;
+
+      assert(((sel->esgs_itemsize / 4) & C_028AAC_ITEMSIZE) == 0);
+
+      /* Only for TES: */
+      if (sel->info.properties[TGSI_PROPERTY_TES_POINT_MODE])
+         sel->rast_prim = PIPE_PRIM_POINTS;
+      else if (sel->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES)
+         sel->rast_prim = PIPE_PRIM_LINE_STRIP;
+      else
+         sel->rast_prim = PIPE_PRIM_TRIANGLES;
+      break;
+
+   case PIPE_SHADER_FRAGMENT:
+      for (i = 0; i < sel->info.num_inputs; i++) {
+         unsigned name = sel->info.input_semantic_name[i];
+         unsigned index = sel->info.input_semantic_index[i];
+
+         switch (name) {
+         case TGSI_SEMANTIC_GENERIC:
+            /* don't process indices the function can't handle */
+            if (index >= SI_MAX_IO_GENERIC)
+               break;
+            /* fall through */
+         default:
+            sel->inputs_read |= 1ull << si_shader_io_get_unique_index(name, index, true);
+            break;
+         case TGSI_SEMANTIC_PCOORD: /* ignore this */
+            break;
+         }
+      }
+
+      for (i = 0; i < 8; i++)
+         if (sel->info.colors_written & (1 << i))
+            sel->colors_written_4bit |= 0xf << (4 * i);
+
+      for (i = 0; i < sel->info.num_inputs; i++) {
+         if (sel->info.input_semantic_name[i] == TGSI_SEMANTIC_COLOR) {
+            int index = sel->info.input_semantic_index[i];
+            sel->color_attr_index[index] = i;
+         }
+      }
+      break;
+   default:;
+   }
+
+   sel->ngg_culling_allowed =
+      sscreen->info.chip_class == GFX10 && sscreen->info.has_dedicated_vram &&
+      sscreen->use_ngg_culling &&
+      /* Disallow TES by default, because TessMark results are mixed. */
+      (sel->type == PIPE_SHADER_VERTEX ||
+       (sscreen->always_use_ngg_culling && sel->type == PIPE_SHADER_TESS_EVAL)) &&
+      sel->info.writes_position &&
+      !sel->info.writes_viewport_index && /* cull only against viewport 0 */
+      !sel->info.writes_memory && !sel->so.num_outputs &&
+      !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] &&
+      !sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
+
+   /* PA_CL_VS_OUT_CNTL */
+   if (sctx->chip_class <= GFX9)
+      sel->pa_cl_vs_out_cntl = si_get_vs_out_cntl(sel, false);
+
+   sel->clipdist_mask = sel->info.writes_clipvertex ? SIX_BITS : sel->info.clipdist_writemask;
+   sel->culldist_mask = sel->info.culldist_writemask << sel->info.num_written_clipdistance;
+
+   /* DB_SHADER_CONTROL */
+   sel->db_shader_control = S_02880C_Z_EXPORT_ENABLE(sel->info.writes_z) |
+                            S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(sel->info.writes_stencil) |
+                            S_02880C_MASK_EXPORT_ENABLE(sel->info.writes_samplemask) |
+                            S_02880C_KILL_ENABLE(sel->info.uses_kill);
+
+   switch (sel->info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]) {
+   case TGSI_FS_DEPTH_LAYOUT_GREATER:
+      sel->db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z);
+      break;
+   case TGSI_FS_DEPTH_LAYOUT_LESS:
+      sel->db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z);
+      break;
+   }
+
+   /* Z_ORDER, EXEC_ON_HIER_FAIL and EXEC_ON_NOOP should be set as following:
+    *
+    *   | early Z/S | writes_mem | allow_ReZ? |      Z_ORDER       | EXEC_ON_HIER_FAIL | EXEC_ON_NOOP
+    * --|-----------|------------|------------|--------------------|-------------------|-------------
+    * 1a|   false   |   false    |   true     | EarlyZ_Then_ReZ    |         0         |     0
+    * 1b|   false   |   false    |   false    | EarlyZ_Then_LateZ  |         0         |     0
+    * 2 |   false   |   true     |   n/a      |       LateZ        |         1         |     0
+    * 3 |   true    |   false    |   n/a      | EarlyZ_Then_LateZ  |         0         |     0
+    * 4 |   true    |   true     |   n/a      | EarlyZ_Then_LateZ  |         0         |     1
+    *
+    * In cases 3 and 4, HW will force Z_ORDER to EarlyZ regardless of what's set in the register.
+    * In case 2, NOOP_CULL is a don't care field. In case 2, 3 and 4, ReZ doesn't make sense.
+    *
+    * Don't use ReZ without profiling !!!
+    *
+    * ReZ decreases performance by 15% in DiRT: Showdown on Ultra settings, which has pretty complex
+    * shaders.
+    */
+   if (sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]) {
+      /* Cases 3, 4. */
+      sel->db_shader_control |= S_02880C_DEPTH_BEFORE_SHADER(1) |
+                                S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z) |
+                                S_02880C_EXEC_ON_NOOP(sel->info.writes_memory);
+   } else if (sel->info.writes_memory) {
+      /* Case 2. */
+      sel->db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z) | S_02880C_EXEC_ON_HIER_FAIL(1);
+   } else {
+      /* Case 1. */
+      sel->db_shader_control |= S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z);
+   }
+
+   if (sel->info.properties[TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE])
+      sel->db_shader_control |= S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(1);
+
+   (void)simple_mtx_init(&sel->mutex, mtx_plain);
+
+   si_schedule_initial_compile(sctx, sel->info.processor, &sel->ready, &sel->compiler_ctx_state,
+                               sel, si_init_shader_selector_async);
+   return sel;
+}
+
+static void *si_create_shader(struct pipe_context *ctx, const struct pipe_shader_state *state)
+{
+   struct si_screen *sscreen = (struct si_screen *)ctx->screen;
+
+   return util_live_shader_cache_get(ctx, &sscreen->live_shader_cache, state);
 }
 
 static void si_update_streamout_state(struct si_context *sctx)
 {
-       struct si_shader_selector *shader_with_so = si_get_vs(sctx)->cso;
+   struct si_shader_selector *shader_with_so = si_get_vs(sctx)->cso;
 
-       if (!shader_with_so)
-               return;
+   if (!shader_with_so)
+      return;
 
-       sctx->streamout.enabled_stream_buffers_mask =
-               shader_with_so->enabled_streamout_buffer_mask;
-       sctx->streamout.stride_in_dw = shader_with_so->so.stride;
+   sctx->streamout.enabled_stream_buffers_mask = shader_with_so->enabled_streamout_buffer_mask;
+   sctx->streamout.stride_in_dw = shader_with_so->so.stride;
 }
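
si_update_streamout_state() copies enabled_streamout_buffer_mask, which si_create_shader_selector()
builds earlier in this hunk as one nibble per vertex stream with one bit per streamout buffer
inside each nibble. A small stand-alone illustration of that packing (the sample outputs are
made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
   struct {
      unsigned output_buffer;
      unsigned stream;
   } outputs[] = {
      {0, 0}, /* buffer 0 on stream 0 -> bit 0  */
      {2, 0}, /* buffer 2 on stream 0 -> bit 2  */
      {1, 3}, /* buffer 1 on stream 3 -> bit 13 */
   };
   uint32_t mask = 0;

   for (unsigned i = 0; i < sizeof(outputs) / sizeof(outputs[0]); i++)
      mask |= (1u << outputs[i].output_buffer) << (outputs[i].stream * 4);

   printf("enabled_streamout_buffer_mask: 0x%04x\n", mask); /* 0x2005 */
   return 0;
}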
 
-static void si_update_clip_regs(struct si_context *sctx,
-                               struct si_shader_selector *old_hw_vs,
-                               struct si_shader *old_hw_vs_variant,
-                               struct si_shader_selector *next_hw_vs,
-                               struct si_shader *next_hw_vs_variant)
+static void si_update_clip_regs(struct si_context *sctx, struct si_shader_selector *old_hw_vs,
+                                struct si_shader *old_hw_vs_variant,
+                                struct si_shader_selector *next_hw_vs,
+                                struct si_shader *next_hw_vs_variant)
 {
-       if (next_hw_vs &&
-           (!old_hw_vs ||
-            old_hw_vs->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] !=
-            next_hw_vs->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] ||
-            old_hw_vs->pa_cl_vs_out_cntl != next_hw_vs->pa_cl_vs_out_cntl ||
-            old_hw_vs->clipdist_mask != next_hw_vs->clipdist_mask ||
-            old_hw_vs->culldist_mask != next_hw_vs->culldist_mask ||
-            !old_hw_vs_variant ||
-            !next_hw_vs_variant ||
-            old_hw_vs_variant->key.opt.clip_disable !=
-            next_hw_vs_variant->key.opt.clip_disable))
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
+   if (next_hw_vs &&
+       (!old_hw_vs ||
+        old_hw_vs->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] !=
+           next_hw_vs->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] ||
+        old_hw_vs->pa_cl_vs_out_cntl != next_hw_vs->pa_cl_vs_out_cntl ||
+        old_hw_vs->clipdist_mask != next_hw_vs->clipdist_mask ||
+        old_hw_vs->culldist_mask != next_hw_vs->culldist_mask || !old_hw_vs_variant ||
+        !next_hw_vs_variant ||
+        old_hw_vs_variant->key.opt.clip_disable != next_hw_vs_variant->key.opt.clip_disable))
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
 }
 
 static void si_update_common_shader_state(struct si_context *sctx)
 {
-       sctx->uses_bindless_samplers =
-               si_shader_uses_bindless_samplers(sctx->vs_shader.cso)  ||
-               si_shader_uses_bindless_samplers(sctx->gs_shader.cso)  ||
-               si_shader_uses_bindless_samplers(sctx->ps_shader.cso)  ||
-               si_shader_uses_bindless_samplers(sctx->tcs_shader.cso) ||
-               si_shader_uses_bindless_samplers(sctx->tes_shader.cso);
-       sctx->uses_bindless_images =
-               si_shader_uses_bindless_images(sctx->vs_shader.cso)  ||
-               si_shader_uses_bindless_images(sctx->gs_shader.cso)  ||
-               si_shader_uses_bindless_images(sctx->ps_shader.cso)  ||
-               si_shader_uses_bindless_images(sctx->tcs_shader.cso) ||
-               si_shader_uses_bindless_images(sctx->tes_shader.cso);
-       sctx->do_update_shaders = true;
+   sctx->uses_bindless_samplers = si_shader_uses_bindless_samplers(sctx->vs_shader.cso) ||
+                                  si_shader_uses_bindless_samplers(sctx->gs_shader.cso) ||
+                                  si_shader_uses_bindless_samplers(sctx->ps_shader.cso) ||
+                                  si_shader_uses_bindless_samplers(sctx->tcs_shader.cso) ||
+                                  si_shader_uses_bindless_samplers(sctx->tes_shader.cso);
+   sctx->uses_bindless_images = si_shader_uses_bindless_images(sctx->vs_shader.cso) ||
+                                si_shader_uses_bindless_images(sctx->gs_shader.cso) ||
+                                si_shader_uses_bindless_images(sctx->ps_shader.cso) ||
+                                si_shader_uses_bindless_images(sctx->tcs_shader.cso) ||
+                                si_shader_uses_bindless_images(sctx->tes_shader.cso);
+   sctx->do_update_shaders = true;
 }
 
 static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso;
-       struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx);
-       struct si_shader_selector *sel = state;
-
-       if (sctx->vs_shader.cso == sel)
-               return;
-
-       sctx->vs_shader.cso = sel;
-       sctx->vs_shader.current = sel ? sel->first_variant : NULL;
-       sctx->num_vs_blit_sgprs = sel ? sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] : 0;
-
-       if (si_update_ngg(sctx))
-               si_shader_change_notify(sctx);
-
-       si_update_common_shader_state(sctx);
-       si_update_vs_viewport_state(sctx);
-       si_set_active_descriptors_for_shader(sctx, sel);
-       si_update_streamout_state(sctx);
-       si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant,
-                           si_get_vs(sctx)->cso, si_get_vs_state(sctx));
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso;
+   struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx);
+   struct si_shader_selector *sel = state;
+
+   if (sctx->vs_shader.cso == sel)
+      return;
+
+   sctx->vs_shader.cso = sel;
+   sctx->vs_shader.current = sel ? sel->first_variant : NULL;
+   sctx->num_vs_blit_sgprs = sel ? sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] : 0;
+
+   if (si_update_ngg(sctx))
+      si_shader_change_notify(sctx);
+
+   si_update_common_shader_state(sctx);
+   si_update_vs_viewport_state(sctx);
+   si_set_active_descriptors_for_shader(sctx, sel);
+   si_update_streamout_state(sctx);
+   si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso,
+                       si_get_vs_state(sctx));
 }
 
 static void si_update_tess_uses_prim_id(struct si_context *sctx)
 {
-       sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id =
-               (sctx->tes_shader.cso &&
-                sctx->tes_shader.cso->info.uses_primid) ||
-               (sctx->tcs_shader.cso &&
-                sctx->tcs_shader.cso->info.uses_primid) ||
-               (sctx->gs_shader.cso &&
-                sctx->gs_shader.cso->info.uses_primid) ||
-               (sctx->ps_shader.cso && !sctx->gs_shader.cso &&
-                sctx->ps_shader.cso->info.uses_primid);
+   sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id =
+      (sctx->tes_shader.cso && sctx->tes_shader.cso->info.uses_primid) ||
+      (sctx->tcs_shader.cso && sctx->tcs_shader.cso->info.uses_primid) ||
+      (sctx->gs_shader.cso && sctx->gs_shader.cso->info.uses_primid) ||
+      (sctx->ps_shader.cso && !sctx->gs_shader.cso && sctx->ps_shader.cso->info.uses_primid);
 }
 
 bool si_update_ngg(struct si_context *sctx)
 {
-       if (!sctx->screen->use_ngg) {
-               assert(!sctx->ngg);
-               return false;
-       }
-
-       bool new_ngg = true;
-
-       if (sctx->gs_shader.cso && sctx->tes_shader.cso &&
-           sctx->gs_shader.cso->tess_turns_off_ngg) {
-               new_ngg = false;
-       } else if (!sctx->screen->use_ngg_streamout) {
-               struct si_shader_selector *last = si_get_vs(sctx)->cso;
-
-               if ((last && last->so.num_outputs) ||
-                   sctx->streamout.prims_gen_query_enabled)
-                       new_ngg = false;
-       }
-
-       if (new_ngg != sctx->ngg) {
-               /* Transitioning from NGG to legacy GS requires VGT_FLUSH on Navi10-14.
-                * VGT_FLUSH is also emitted at the beginning of IBs when legacy GS ring
-                * pointers are set.
-                */
-               if ((sctx->family == CHIP_NAVI10 ||
-                    sctx->family == CHIP_NAVI12 ||
-                    sctx->family == CHIP_NAVI14) &&
-                   !new_ngg)
-                       sctx->flags |= SI_CONTEXT_VGT_FLUSH;
-
-               sctx->ngg = new_ngg;
-               sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
-               return true;
-       }
-       return false;
+   if (!sctx->screen->use_ngg) {
+      assert(!sctx->ngg);
+      return false;
+   }
+
+   bool new_ngg = true;
+
+   if (sctx->gs_shader.cso && sctx->tes_shader.cso && sctx->gs_shader.cso->tess_turns_off_ngg) {
+      new_ngg = false;
+   } else if (!sctx->screen->use_ngg_streamout) {
+      struct si_shader_selector *last = si_get_vs(sctx)->cso;
+
+      if ((last && last->so.num_outputs) || sctx->streamout.prims_gen_query_enabled)
+         new_ngg = false;
+   }
+
+   if (new_ngg != sctx->ngg) {
+      /* Transitioning from NGG to legacy GS requires VGT_FLUSH on Navi10-14.
+       * VGT_FLUSH is also emitted at the beginning of IBs when legacy GS ring
+       * pointers are set.
+       */
+      if ((sctx->family == CHIP_NAVI10 || sctx->family == CHIP_NAVI12 ||
+           sctx->family == CHIP_NAVI14) &&
+          !new_ngg)
+         sctx->flags |= SI_CONTEXT_VGT_FLUSH;
+
+      sctx->ngg = new_ngg;
+      sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
+      return true;
+   }
+   return false;
 }
 
 static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso;
-       struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx);
-       struct si_shader_selector *sel = state;
-       bool enable_changed = !!sctx->gs_shader.cso != !!sel;
-       bool ngg_changed;
-
-       if (sctx->gs_shader.cso == sel)
-               return;
-
-       sctx->gs_shader.cso = sel;
-       sctx->gs_shader.current = sel ? sel->first_variant : NULL;
-       sctx->ia_multi_vgt_param_key.u.uses_gs = sel != NULL;
-
-       si_update_common_shader_state(sctx);
-       sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
-
-       ngg_changed = si_update_ngg(sctx);
-       if (ngg_changed || enable_changed)
-               si_shader_change_notify(sctx);
-       if (enable_changed) {
-               if (sctx->ia_multi_vgt_param_key.u.uses_tess)
-                       si_update_tess_uses_prim_id(sctx);
-       }
-       si_update_vs_viewport_state(sctx);
-       si_set_active_descriptors_for_shader(sctx, sel);
-       si_update_streamout_state(sctx);
-       si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant,
-                           si_get_vs(sctx)->cso, si_get_vs_state(sctx));
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso;
+   struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx);
+   struct si_shader_selector *sel = state;
+   bool enable_changed = !!sctx->gs_shader.cso != !!sel;
+   bool ngg_changed;
+
+   if (sctx->gs_shader.cso == sel)
+      return;
+
+   sctx->gs_shader.cso = sel;
+   sctx->gs_shader.current = sel ? sel->first_variant : NULL;
+   sctx->ia_multi_vgt_param_key.u.uses_gs = sel != NULL;
+
+   si_update_common_shader_state(sctx);
+   sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
+
+   ngg_changed = si_update_ngg(sctx);
+   if (ngg_changed || enable_changed)
+      si_shader_change_notify(sctx);
+   if (enable_changed) {
+      if (sctx->ia_multi_vgt_param_key.u.uses_tess)
+         si_update_tess_uses_prim_id(sctx);
+   }
+   si_update_vs_viewport_state(sctx);
+   si_set_active_descriptors_for_shader(sctx, sel);
+   si_update_streamout_state(sctx);
+   si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso,
+                       si_get_vs_state(sctx));
 }
 
 static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_shader_selector *sel = state;
-       bool enable_changed = !!sctx->tcs_shader.cso != !!sel;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_shader_selector *sel = state;
+   bool enable_changed = !!sctx->tcs_shader.cso != !!sel;
 
-       if (sctx->tcs_shader.cso == sel)
-               return;
+   if (sctx->tcs_shader.cso == sel)
+      return;
 
-       sctx->tcs_shader.cso = sel;
-       sctx->tcs_shader.current = sel ? sel->first_variant : NULL;
-       si_update_tess_uses_prim_id(sctx);
+   sctx->tcs_shader.cso = sel;
+   sctx->tcs_shader.current = sel ? sel->first_variant : NULL;
+   si_update_tess_uses_prim_id(sctx);
 
-       si_update_common_shader_state(sctx);
+   si_update_common_shader_state(sctx);
 
-       if (enable_changed)
-               sctx->last_tcs = NULL; /* invalidate derived tess state */
+   if (enable_changed)
+      sctx->last_tcs = NULL; /* invalidate derived tess state */
 
-       si_set_active_descriptors_for_shader(sctx, sel);
+   si_set_active_descriptors_for_shader(sctx, sel);
 }
 
 static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso;
-       struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx);
-       struct si_shader_selector *sel = state;
-       bool enable_changed = !!sctx->tes_shader.cso != !!sel;
-
-       if (sctx->tes_shader.cso == sel)
-               return;
-
-       sctx->tes_shader.cso = sel;
-       sctx->tes_shader.current = sel ? sel->first_variant : NULL;
-       sctx->ia_multi_vgt_param_key.u.uses_tess = sel != NULL;
-       si_update_tess_uses_prim_id(sctx);
-
-       si_update_common_shader_state(sctx);
-       sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
-
-       bool ngg_changed = si_update_ngg(sctx);
-       if (ngg_changed || enable_changed)
-               si_shader_change_notify(sctx);
-       if (enable_changed)
-               sctx->last_tes_sh_base = -1; /* invalidate derived tess state */
-       si_update_vs_viewport_state(sctx);
-       si_set_active_descriptors_for_shader(sctx, sel);
-       si_update_streamout_state(sctx);
-       si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant,
-                           si_get_vs(sctx)->cso, si_get_vs_state(sctx));
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso;
+   struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx);
+   struct si_shader_selector *sel = state;
+   bool enable_changed = !!sctx->tes_shader.cso != !!sel;
+
+   if (sctx->tes_shader.cso == sel)
+      return;
+
+   sctx->tes_shader.cso = sel;
+   sctx->tes_shader.current = sel ? sel->first_variant : NULL;
+   sctx->ia_multi_vgt_param_key.u.uses_tess = sel != NULL;
+   si_update_tess_uses_prim_id(sctx);
+
+   si_update_common_shader_state(sctx);
+   sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
+
+   bool ngg_changed = si_update_ngg(sctx);
+   if (ngg_changed || enable_changed)
+      si_shader_change_notify(sctx);
+   if (enable_changed)
+      sctx->last_tes_sh_base = -1; /* invalidate derived tess state */
+   si_update_vs_viewport_state(sctx);
+   si_set_active_descriptors_for_shader(sctx, sel);
+   si_update_streamout_state(sctx);
+   si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso,
+                       si_get_vs_state(sctx));
 }
 
 static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_shader_selector *old_sel = sctx->ps_shader.cso;
-       struct si_shader_selector *sel = state;
-
-       /* skip if supplied shader is one already in use */
-       if (old_sel == sel)
-               return;
-
-       sctx->ps_shader.cso = sel;
-       sctx->ps_shader.current = sel ? sel->first_variant : NULL;
-
-       si_update_common_shader_state(sctx);
-       if (sel) {
-               if (sctx->ia_multi_vgt_param_key.u.uses_tess)
-                       si_update_tess_uses_prim_id(sctx);
-
-               if (!old_sel ||
-                   old_sel->info.colors_written != sel->info.colors_written)
-                       si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
-
-               if (sctx->screen->has_out_of_order_rast &&
-                   (!old_sel ||
-                    old_sel->info.writes_memory != sel->info.writes_memory ||
-                    old_sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] !=
-                    sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]))
-                       si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
-       }
-       si_set_active_descriptors_for_shader(sctx, sel);
-       si_update_ps_colorbuf0_slot(sctx);
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_shader_selector *old_sel = sctx->ps_shader.cso;
+   struct si_shader_selector *sel = state;
+
+   /* skip if supplied shader is one already in use */
+   if (old_sel == sel)
+      return;
+
+   sctx->ps_shader.cso = sel;
+   sctx->ps_shader.current = sel ? sel->first_variant : NULL;
+
+   si_update_common_shader_state(sctx);
+   if (sel) {
+      if (sctx->ia_multi_vgt_param_key.u.uses_tess)
+         si_update_tess_uses_prim_id(sctx);
+
+      if (!old_sel || old_sel->info.colors_written != sel->info.colors_written)
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
+
+      if (sctx->screen->has_out_of_order_rast &&
+          (!old_sel || old_sel->info.writes_memory != sel->info.writes_memory ||
+           old_sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] !=
+              sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]))
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+   }
+   si_set_active_descriptors_for_shader(sctx, sel);
+   si_update_ps_colorbuf0_slot(sctx);
 }
 
 static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
 {
-       if (shader->is_optimized) {
-               util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority,
-                                   &shader->ready);
-       }
-
-       util_queue_fence_destroy(&shader->ready);
-
-       if (shader->pm4) {
-               /* If destroyed shaders were not unbound, the next compiled
-                * shader variant could get the same pointer address and so
-                * binding it to the same shader stage would be considered
-                * a no-op, causing random behavior.
-                */
-               switch (shader->selector->type) {
-               case PIPE_SHADER_VERTEX:
-                       if (shader->key.as_ls) {
-                               assert(sctx->chip_class <= GFX8);
-                               si_pm4_delete_state(sctx, ls, shader->pm4);
-                       } else if (shader->key.as_es) {
-                               assert(sctx->chip_class <= GFX8);
-                               si_pm4_delete_state(sctx, es, shader->pm4);
-                       } else if (shader->key.as_ngg) {
-                               si_pm4_delete_state(sctx, gs, shader->pm4);
-                       } else {
-                               si_pm4_delete_state(sctx, vs, shader->pm4);
-                       }
-                       break;
-               case PIPE_SHADER_TESS_CTRL:
-                       si_pm4_delete_state(sctx, hs, shader->pm4);
-                       break;
-               case PIPE_SHADER_TESS_EVAL:
-                       if (shader->key.as_es) {
-                               assert(sctx->chip_class <= GFX8);
-                               si_pm4_delete_state(sctx, es, shader->pm4);
-                       } else if (shader->key.as_ngg) {
-                               si_pm4_delete_state(sctx, gs, shader->pm4);
-                       } else {
-                               si_pm4_delete_state(sctx, vs, shader->pm4);
-                       }
-                       break;
-               case PIPE_SHADER_GEOMETRY:
-                       if (shader->is_gs_copy_shader)
-                               si_pm4_delete_state(sctx, vs, shader->pm4);
-                       else
-                               si_pm4_delete_state(sctx, gs, shader->pm4);
-                       break;
-               case PIPE_SHADER_FRAGMENT:
-                       si_pm4_delete_state(sctx, ps, shader->pm4);
-                       break;
-               default:;
-               }
-       }
-
-       si_shader_selector_reference(sctx, &shader->previous_stage_sel, NULL);
-       si_shader_destroy(shader);
-       free(shader);
+   if (shader->is_optimized) {
+      util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority, &shader->ready);
+   }
+
+   util_queue_fence_destroy(&shader->ready);
+
+   if (shader->pm4) {
+      /* If destroyed shaders were not unbound, the next compiled
+       * shader variant could get the same pointer address and so
+       * binding it to the same shader stage would be considered
+       * a no-op, causing random behavior.
+       */
+      switch (shader->selector->type) {
+      case PIPE_SHADER_VERTEX:
+         if (shader->key.as_ls) {
+            assert(sctx->chip_class <= GFX8);
+            si_pm4_delete_state(sctx, ls, shader->pm4);
+         } else if (shader->key.as_es) {
+            assert(sctx->chip_class <= GFX8);
+            si_pm4_delete_state(sctx, es, shader->pm4);
+         } else if (shader->key.as_ngg) {
+            si_pm4_delete_state(sctx, gs, shader->pm4);
+         } else {
+            si_pm4_delete_state(sctx, vs, shader->pm4);
+         }
+         break;
+      case PIPE_SHADER_TESS_CTRL:
+         si_pm4_delete_state(sctx, hs, shader->pm4);
+         break;
+      case PIPE_SHADER_TESS_EVAL:
+         if (shader->key.as_es) {
+            assert(sctx->chip_class <= GFX8);
+            si_pm4_delete_state(sctx, es, shader->pm4);
+         } else if (shader->key.as_ngg) {
+            si_pm4_delete_state(sctx, gs, shader->pm4);
+         } else {
+            si_pm4_delete_state(sctx, vs, shader->pm4);
+         }
+         break;
+      case PIPE_SHADER_GEOMETRY:
+         if (shader->is_gs_copy_shader)
+            si_pm4_delete_state(sctx, vs, shader->pm4);
+         else
+            si_pm4_delete_state(sctx, gs, shader->pm4);
+         break;
+      case PIPE_SHADER_FRAGMENT:
+         si_pm4_delete_state(sctx, ps, shader->pm4);
+         break;
+      default:;
+      }
+   }
+
+   si_shader_selector_reference(sctx, &shader->previous_stage_sel, NULL);
+   si_shader_destroy(shader);
+   free(shader);
 }
 
 static void si_destroy_shader_selector(struct pipe_context *ctx, void *cso)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
-       struct si_shader_selector *sel = (struct si_shader_selector *)cso;
-       struct si_shader *p = sel->first_variant, *c;
-       struct si_shader_ctx_state *current_shader[SI_NUM_SHADERS] = {
-               [PIPE_SHADER_VERTEX] = &sctx->vs_shader,
-               [PIPE_SHADER_TESS_CTRL] = &sctx->tcs_shader,
-               [PIPE_SHADER_TESS_EVAL] = &sctx->tes_shader,
-               [PIPE_SHADER_GEOMETRY] = &sctx->gs_shader,
-               [PIPE_SHADER_FRAGMENT] = &sctx->ps_shader,
-       };
-
-       util_queue_drop_job(&sctx->screen->shader_compiler_queue, &sel->ready);
-
-       if (current_shader[sel->type]->cso == sel) {
-               current_shader[sel->type]->cso = NULL;
-               current_shader[sel->type]->current = NULL;
-       }
-
-       while (p) {
-               c = p->next_variant;
-               si_delete_shader(sctx, p);
-               p = c;
-       }
-
-       if (sel->main_shader_part)
-               si_delete_shader(sctx, sel->main_shader_part);
-       if (sel->main_shader_part_ls)
-               si_delete_shader(sctx, sel->main_shader_part_ls);
-       if (sel->main_shader_part_es)
-               si_delete_shader(sctx, sel->main_shader_part_es);
-       if (sel->main_shader_part_ngg)
-               si_delete_shader(sctx, sel->main_shader_part_ngg);
-       if (sel->gs_copy_shader)
-               si_delete_shader(sctx, sel->gs_copy_shader);
-
-       util_queue_fence_destroy(&sel->ready);
-       simple_mtx_destroy(&sel->mutex);
-       ralloc_free(sel->nir);
-       free(sel->nir_binary);
-       free(sel);
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_shader_selector *sel = (struct si_shader_selector *)cso;
+   struct si_shader *p = sel->first_variant, *c;
+   struct si_shader_ctx_state *current_shader[SI_NUM_SHADERS] = {
+      [PIPE_SHADER_VERTEX] = &sctx->vs_shader,     [PIPE_SHADER_TESS_CTRL] = &sctx->tcs_shader,
+      [PIPE_SHADER_TESS_EVAL] = &sctx->tes_shader, [PIPE_SHADER_GEOMETRY] = &sctx->gs_shader,
+      [PIPE_SHADER_FRAGMENT] = &sctx->ps_shader,
+   };
+
+   util_queue_drop_job(&sctx->screen->shader_compiler_queue, &sel->ready);
+
+   if (current_shader[sel->type]->cso == sel) {
+      current_shader[sel->type]->cso = NULL;
+      current_shader[sel->type]->current = NULL;
+   }
+
+   while (p) {
+      c = p->next_variant;
+      si_delete_shader(sctx, p);
+      p = c;
+   }
+
+   if (sel->main_shader_part)
+      si_delete_shader(sctx, sel->main_shader_part);
+   if (sel->main_shader_part_ls)
+      si_delete_shader(sctx, sel->main_shader_part_ls);
+   if (sel->main_shader_part_es)
+      si_delete_shader(sctx, sel->main_shader_part_es);
+   if (sel->main_shader_part_ngg)
+      si_delete_shader(sctx, sel->main_shader_part_ngg);
+   if (sel->gs_copy_shader)
+      si_delete_shader(sctx, sel->gs_copy_shader);
+
+   util_queue_fence_destroy(&sel->ready);
+   simple_mtx_destroy(&sel->mutex);
+   ralloc_free(sel->nir);
+   free(sel->nir_binary);
+   free(sel);
 }
 
 static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_shader_selector *sel = (struct si_shader_selector *)state;
-
-       si_shader_selector_reference(sctx, &sel, NULL);
-}
-
-static unsigned si_get_ps_input_cntl(struct si_context *sctx,
-                                    struct si_shader *vs, unsigned name,
-                                    unsigned index, unsigned interpolate)
-{
-       struct si_shader_info *vsinfo = &vs->selector->info;
-       unsigned j, offset, ps_input_cntl = 0;
-
-       if (interpolate == TGSI_INTERPOLATE_CONSTANT ||
-           (interpolate == TGSI_INTERPOLATE_COLOR && sctx->flatshade) ||
-           name == TGSI_SEMANTIC_PRIMID)
-               ps_input_cntl |= S_028644_FLAT_SHADE(1);
-
-       if (name == TGSI_SEMANTIC_PCOORD ||
-           (name == TGSI_SEMANTIC_TEXCOORD &&
-            sctx->sprite_coord_enable & (1 << index))) {
-               ps_input_cntl |= S_028644_PT_SPRITE_TEX(1);
-       }
-
-       for (j = 0; j < vsinfo->num_outputs; j++) {
-               if (name == vsinfo->output_semantic_name[j] &&
-                   index == vsinfo->output_semantic_index[j]) {
-                       offset = vs->info.vs_output_param_offset[j];
-
-                       if (offset <= AC_EXP_PARAM_OFFSET_31) {
-                               /* The input is loaded from parameter memory. */
-                               ps_input_cntl |= S_028644_OFFSET(offset);
-                       } else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) {
-                               if (offset == AC_EXP_PARAM_UNDEFINED) {
-                                       /* This can happen with depth-only rendering. */
-                                       offset = 0;
-                               } else {
-                                       /* The input is a DEFAULT_VAL constant. */
-                                       assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
-                                              offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
-                                       offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;
-                               }
-
-                               ps_input_cntl = S_028644_OFFSET(0x20) |
-                                               S_028644_DEFAULT_VAL(offset);
-                       }
-                       break;
-               }
-       }
-
-       if (j == vsinfo->num_outputs && name == TGSI_SEMANTIC_PRIMID)
-               /* PrimID is written after the last output when HW VS is used. */
-               ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[vsinfo->num_outputs]);
-       else if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(ps_input_cntl)) {
-               /* No corresponding output found, load defaults into input.
-                * Don't set any other bits.
-                * (FLAT_SHADE=1 completely changes behavior) */
-               ps_input_cntl = S_028644_OFFSET(0x20);
-               /* D3D 9 behaviour. GL is undefined */
-               if (name == TGSI_SEMANTIC_COLOR && index == 0)
-                       ps_input_cntl |= S_028644_DEFAULT_VAL(3);
-       }
-       return ps_input_cntl;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_shader_selector *sel = (struct si_shader_selector *)state;
+
+   si_shader_selector_reference(sctx, &sel, NULL);
+}
+
+static unsigned si_get_ps_input_cntl(struct si_context *sctx, struct si_shader *vs, unsigned name,
+                                     unsigned index, unsigned interpolate)
+{
+   struct si_shader_info *vsinfo = &vs->selector->info;
+   unsigned j, offset, ps_input_cntl = 0;
+
+   if (interpolate == TGSI_INTERPOLATE_CONSTANT ||
+       (interpolate == TGSI_INTERPOLATE_COLOR && sctx->flatshade) || name == TGSI_SEMANTIC_PRIMID)
+      ps_input_cntl |= S_028644_FLAT_SHADE(1);
+
+   if (name == TGSI_SEMANTIC_PCOORD ||
+       (name == TGSI_SEMANTIC_TEXCOORD && sctx->sprite_coord_enable & (1 << index))) {
+      ps_input_cntl |= S_028644_PT_SPRITE_TEX(1);
+   }
+
+   for (j = 0; j < vsinfo->num_outputs; j++) {
+      if (name == vsinfo->output_semantic_name[j] && index == vsinfo->output_semantic_index[j]) {
+         offset = vs->info.vs_output_param_offset[j];
+
+         if (offset <= AC_EXP_PARAM_OFFSET_31) {
+            /* The input is loaded from parameter memory. */
+            ps_input_cntl |= S_028644_OFFSET(offset);
+         } else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) {
+            if (offset == AC_EXP_PARAM_UNDEFINED) {
+               /* This can happen with depth-only rendering. */
+               offset = 0;
+            } else {
+               /* The input is a DEFAULT_VAL constant. */
+               assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
+                      offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
+               offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;
+            }
+
+            ps_input_cntl = S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(offset);
+         }
+         break;
+      }
+   }
+
+   if (j == vsinfo->num_outputs && name == TGSI_SEMANTIC_PRIMID)
+      /* PrimID is written after the last output when HW VS is used. */
+      ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[vsinfo->num_outputs]);
+   else if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(ps_input_cntl)) {
+      /* No corresponding output found, load defaults into input.
+       * Don't set any other bits.
+       * (FLAT_SHADE=1 completely changes behavior) */
+      ps_input_cntl = S_028644_OFFSET(0x20);
+      /* D3D 9 behaviour. GL is undefined */
+      if (name == TGSI_SEMANTIC_COLOR && index == 0)
+         ps_input_cntl |= S_028644_DEFAULT_VAL(3);
+   }
+   return ps_input_cntl;
 }
 
 static void si_emit_spi_map(struct si_context *sctx)
 {
-       struct si_shader *ps = sctx->ps_shader.current;
-       struct si_shader *vs = si_get_vs_state(sctx);
-       struct si_shader_info *psinfo = ps ? &ps->selector->info : NULL;
-       unsigned i, num_interp, num_written = 0, bcol_interp[2];
-       unsigned spi_ps_input_cntl[32];
-
-       if (!ps || !ps->selector->info.num_inputs)
-               return;
-
-       num_interp = si_get_ps_num_interp(ps);
-       assert(num_interp > 0);
-
-       for (i = 0; i < psinfo->num_inputs; i++) {
-               unsigned name = psinfo->input_semantic_name[i];
-               unsigned index = psinfo->input_semantic_index[i];
-               unsigned interpolate = psinfo->input_interpolate[i];
-
-               spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, name,
-                                                           index, interpolate);
-
-               if (name == TGSI_SEMANTIC_COLOR) {
-                       assert(index < ARRAY_SIZE(bcol_interp));
-                       bcol_interp[index] = interpolate;
-               }
-       }
-
-       if (ps->key.part.ps.prolog.color_two_side) {
-               unsigned bcol = TGSI_SEMANTIC_BCOLOR;
-
-               for (i = 0; i < 2; i++) {
-                       if (!(psinfo->colors_read & (0xf << (i * 4))))
-                               continue;
-
-                       spi_ps_input_cntl[num_written++] =
-                         si_get_ps_input_cntl(sctx, vs, bcol, i, bcol_interp[i]);
-
-               }
-       }
-       assert(num_interp == num_written);
-
-       /* R_028644_SPI_PS_INPUT_CNTL_0 */
-       /* Dota 2: Only ~16% of SPI map updates set different values. */
-       /* Talos: Only ~9% of SPI map updates set different values. */
-       unsigned initial_cdw = sctx->gfx_cs->current.cdw;
-       radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0,
-                                   spi_ps_input_cntl,
-                                   sctx->tracked_regs.spi_ps_input_cntl, num_interp);
-
-       if (initial_cdw != sctx->gfx_cs->current.cdw)
-               sctx->context_roll = true;
+   struct si_shader *ps = sctx->ps_shader.current;
+   struct si_shader *vs = si_get_vs_state(sctx);
+   struct si_shader_info *psinfo = ps ? &ps->selector->info : NULL;
+   unsigned i, num_interp, num_written = 0, bcol_interp[2];
+   unsigned spi_ps_input_cntl[32];
+
+   if (!ps || !ps->selector->info.num_inputs)
+      return;
+
+   num_interp = si_get_ps_num_interp(ps);
+   assert(num_interp > 0);
+
+   for (i = 0; i < psinfo->num_inputs; i++) {
+      unsigned name = psinfo->input_semantic_name[i];
+      unsigned index = psinfo->input_semantic_index[i];
+      unsigned interpolate = psinfo->input_interpolate[i];
+
+      spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, name, index, interpolate);
+
+      if (name == TGSI_SEMANTIC_COLOR) {
+         assert(index < ARRAY_SIZE(bcol_interp));
+         bcol_interp[index] = interpolate;
+      }
+   }
+
+   if (ps->key.part.ps.prolog.color_two_side) {
+      unsigned bcol = TGSI_SEMANTIC_BCOLOR;
+
+      for (i = 0; i < 2; i++) {
+         if (!(psinfo->colors_read & (0xf << (i * 4))))
+            continue;
+
+         spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, bcol, i, bcol_interp[i]);
+      }
+   }
+   assert(num_interp == num_written);
+
+   /* R_028644_SPI_PS_INPUT_CNTL_0 */
+   /* Dota 2: Only ~16% of SPI map updates set different values. */
+   /* Talos: Only ~9% of SPI map updates set different values. */
+   unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+   radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl,
+                               sctx->tracked_regs.spi_ps_input_cntl, num_interp);
+
+   if (initial_cdw != sctx->gfx_cs->current.cdw)
+      sctx->context_roll = true;
 }
 
 /**
@@ -3535,169 +3278,150 @@ static void si_emit_spi_map(struct si_context *sctx)
  */
 static void si_init_config_add_vgt_flush(struct si_context *sctx)
 {
-       if (sctx->init_config_has_vgt_flush)
-               return;
-
-       /* Done by Vulkan before VGT_FLUSH. */
-       si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE);
-       si_pm4_cmd_add(sctx->init_config,
-                      EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
-       si_pm4_cmd_end(sctx->init_config, false);
-
-       /* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */
-       si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE);
-       si_pm4_cmd_add(sctx->init_config, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
-       si_pm4_cmd_end(sctx->init_config, false);
-       sctx->init_config_has_vgt_flush = true;
+   if (sctx->init_config_has_vgt_flush)
+      return;
+
+   /* Done by Vulkan before VGT_FLUSH. */
+   si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE);
+   si_pm4_cmd_add(sctx->init_config, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+   si_pm4_cmd_end(sctx->init_config, false);
+
+   /* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */
+   si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE);
+   si_pm4_cmd_add(sctx->init_config, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
+   si_pm4_cmd_end(sctx->init_config, false);
+   sctx->init_config_has_vgt_flush = true;
 }
 
 /* Initialize state related to ESGS / GSVS ring buffers */
 static bool si_update_gs_ring_buffers(struct si_context *sctx)
 {
-       struct si_shader_selector *es =
-               sctx->tes_shader.cso ? sctx->tes_shader.cso : sctx->vs_shader.cso;
-       struct si_shader_selector *gs = sctx->gs_shader.cso;
-       struct si_pm4_state *pm4;
-
-       /* Chip constants. */
-       unsigned num_se = sctx->screen->info.max_se;
-       unsigned wave_size = 64;
-       unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */
-       /* On GFX6-GFX7, the value comes from VGT_GS_VERTEX_REUSE = 16.
-        * On GFX8+, the value comes from VGT_VERTEX_REUSE_BLOCK_CNTL = 30 (+2).
-        */
-       unsigned gs_vertex_reuse = (sctx->chip_class >= GFX8 ? 32 : 16) * num_se;
-       unsigned alignment = 256 * num_se;
-       /* The maximum size is 63.999 MB per SE. */
-       unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se;
-
-       /* Calculate the minimum size. */
-       unsigned min_esgs_ring_size = align(es->esgs_itemsize * gs_vertex_reuse *
-                                           wave_size, alignment);
-
-       /* These are recommended sizes, not minimum sizes. */
-       unsigned esgs_ring_size = max_gs_waves * 2 * wave_size *
-                                 es->esgs_itemsize * gs->gs_input_verts_per_prim;
-       unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size *
-                                 gs->max_gsvs_emit_size;
-
-       min_esgs_ring_size = align(min_esgs_ring_size, alignment);
-       esgs_ring_size = align(esgs_ring_size, alignment);
-       gsvs_ring_size = align(gsvs_ring_size, alignment);
-
-       esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size);
-       gsvs_ring_size = MIN2(gsvs_ring_size, max_size);
-
-       /* Some rings don't have to be allocated if shaders don't use them.
-        * (e.g. no varyings between ES and GS or GS and VS)
-        *
-        * GFX9 doesn't have the ESGS ring.
-        */
-       bool update_esgs = sctx->chip_class <= GFX8 &&
-                          esgs_ring_size &&
-                          (!sctx->esgs_ring ||
-                           sctx->esgs_ring->width0 < esgs_ring_size);
-       bool update_gsvs = gsvs_ring_size &&
-                          (!sctx->gsvs_ring ||
-                           sctx->gsvs_ring->width0 < gsvs_ring_size);
-
-       if (!update_esgs && !update_gsvs)
-               return true;
-
-       if (update_esgs) {
-               pipe_resource_reference(&sctx->esgs_ring, NULL);
-               sctx->esgs_ring =
-                       pipe_aligned_buffer_create(sctx->b.screen,
-                                                  SI_RESOURCE_FLAG_UNMAPPABLE,
-                                                  PIPE_USAGE_DEFAULT,
-                                                  esgs_ring_size,
-                                                  sctx->screen->info.pte_fragment_size);
-               if (!sctx->esgs_ring)
-                       return false;
-       }
-
-       if (update_gsvs) {
-               pipe_resource_reference(&sctx->gsvs_ring, NULL);
-               sctx->gsvs_ring =
-                       pipe_aligned_buffer_create(sctx->b.screen,
-                                                  SI_RESOURCE_FLAG_UNMAPPABLE,
-                                                  PIPE_USAGE_DEFAULT,
-                                                  gsvs_ring_size,
-                                                  sctx->screen->info.pte_fragment_size);
-               if (!sctx->gsvs_ring)
-                       return false;
-       }
-
-       /* Create the "init_config_gs_rings" state. */
-       pm4 = CALLOC_STRUCT(si_pm4_state);
-       if (!pm4)
-               return false;
-
-       if (sctx->chip_class >= GFX7) {
-               if (sctx->esgs_ring) {
-                       assert(sctx->chip_class <= GFX8);
-                       si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE,
-                                      sctx->esgs_ring->width0 / 256);
-               }
-               if (sctx->gsvs_ring)
-                       si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE,
-                                      sctx->gsvs_ring->width0 / 256);
-       } else {
-               if (sctx->esgs_ring)
-                       si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE,
-                                      sctx->esgs_ring->width0 / 256);
-               if (sctx->gsvs_ring)
-                       si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE,
-                                      sctx->gsvs_ring->width0 / 256);
-       }
-
-       /* Set the state. */
-       if (sctx->init_config_gs_rings)
-               si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
-       sctx->init_config_gs_rings = pm4;
-
-       if (!sctx->init_config_has_vgt_flush) {
-               si_init_config_add_vgt_flush(sctx);
-               si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
-       }
-
-       /* Flush the context to re-emit both init_config states. */
-       sctx->initial_gfx_cs_size = 0; /* force flush */
-       si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
-
-       /* Set ring bindings. */
-       if (sctx->esgs_ring) {
-               assert(sctx->chip_class <= GFX8);
-               si_set_ring_buffer(sctx, SI_ES_RING_ESGS,
-                                  sctx->esgs_ring, 0, sctx->esgs_ring->width0,
-                                  true, true, 4, 64, 0);
-               si_set_ring_buffer(sctx, SI_GS_RING_ESGS,
-                                  sctx->esgs_ring, 0, sctx->esgs_ring->width0,
-                                  false, false, 0, 0, 0);
-       }
-       if (sctx->gsvs_ring) {
-               si_set_ring_buffer(sctx, SI_RING_GSVS,
-                                  sctx->gsvs_ring, 0, sctx->gsvs_ring->width0,
-                                  false, false, 0, 0, 0);
-       }
-
-       return true;
+   struct si_shader_selector *es =
+      sctx->tes_shader.cso ? sctx->tes_shader.cso : sctx->vs_shader.cso;
+   struct si_shader_selector *gs = sctx->gs_shader.cso;
+   struct si_pm4_state *pm4;
+
+   /* Chip constants. */
+   unsigned num_se = sctx->screen->info.max_se;
+   unsigned wave_size = 64;
+   unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */
+   /* On GFX6-GFX7, the value comes from VGT_GS_VERTEX_REUSE = 16.
+    * On GFX8+, the value comes from VGT_VERTEX_REUSE_BLOCK_CNTL = 30 (+2).
+    */
+   unsigned gs_vertex_reuse = (sctx->chip_class >= GFX8 ? 32 : 16) * num_se;
+   unsigned alignment = 256 * num_se;
+   /* The maximum size is 63.999 MB per SE. */
+   unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se;
+
+   /* Calculate the minimum size. */
+   unsigned min_esgs_ring_size = align(es->esgs_itemsize * gs_vertex_reuse * wave_size, alignment);
+
+   /* These are recommended sizes, not minimum sizes. */
+   unsigned esgs_ring_size =
+      max_gs_waves * 2 * wave_size * es->esgs_itemsize * gs->gs_input_verts_per_prim;
+   unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size * gs->max_gsvs_emit_size;
+
+   min_esgs_ring_size = align(min_esgs_ring_size, alignment);
+   esgs_ring_size = align(esgs_ring_size, alignment);
+   gsvs_ring_size = align(gsvs_ring_size, alignment);
+
+   esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size);
+   gsvs_ring_size = MIN2(gsvs_ring_size, max_size);
+
+   /* Some rings don't have to be allocated if shaders don't use them.
+    * (e.g. no varyings between ES and GS or GS and VS)
+    *
+    * GFX9 doesn't have the ESGS ring.
+    */
+   bool update_esgs = sctx->chip_class <= GFX8 && esgs_ring_size &&
+                      (!sctx->esgs_ring || sctx->esgs_ring->width0 < esgs_ring_size);
+   bool update_gsvs =
+      gsvs_ring_size && (!sctx->gsvs_ring || sctx->gsvs_ring->width0 < gsvs_ring_size);
+
+   if (!update_esgs && !update_gsvs)
+      return true;
+
+   if (update_esgs) {
+      pipe_resource_reference(&sctx->esgs_ring, NULL);
+      sctx->esgs_ring =
+         pipe_aligned_buffer_create(sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT,
+                                    esgs_ring_size, sctx->screen->info.pte_fragment_size);
+      if (!sctx->esgs_ring)
+         return false;
+   }
+
+   if (update_gsvs) {
+      pipe_resource_reference(&sctx->gsvs_ring, NULL);
+      sctx->gsvs_ring =
+         pipe_aligned_buffer_create(sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT,
+                                    gsvs_ring_size, sctx->screen->info.pte_fragment_size);
+      if (!sctx->gsvs_ring)
+         return false;
+   }
+
+   /* Create the "init_config_gs_rings" state. */
+   pm4 = CALLOC_STRUCT(si_pm4_state);
+   if (!pm4)
+      return false;
+
+   if (sctx->chip_class >= GFX7) {
+      if (sctx->esgs_ring) {
+         assert(sctx->chip_class <= GFX8);
+         si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE, sctx->esgs_ring->width0 / 256);
+      }
+      if (sctx->gsvs_ring)
+         si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE, sctx->gsvs_ring->width0 / 256);
+   } else {
+      if (sctx->esgs_ring)
+         si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE, sctx->esgs_ring->width0 / 256);
+      if (sctx->gsvs_ring)
+         si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE, sctx->gsvs_ring->width0 / 256);
+   }
+
+   /* Set the state. */
+   if (sctx->init_config_gs_rings)
+      si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
+   sctx->init_config_gs_rings = pm4;
+
+   if (!sctx->init_config_has_vgt_flush) {
+      si_init_config_add_vgt_flush(sctx);
+      si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
+   }
+
+   /* Flush the context to re-emit both init_config states. */
+   sctx->initial_gfx_cs_size = 0; /* force flush */
+   si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+
+   /* Set ring bindings. */
+   if (sctx->esgs_ring) {
+      assert(sctx->chip_class <= GFX8);
+      si_set_ring_buffer(sctx, SI_ES_RING_ESGS, sctx->esgs_ring, 0, sctx->esgs_ring->width0, true,
+                         true, 4, 64, 0);
+      si_set_ring_buffer(sctx, SI_GS_RING_ESGS, sctx->esgs_ring, 0, sctx->esgs_ring->width0, false,
+                         false, 0, 0, 0);
+   }
+   if (sctx->gsvs_ring) {
+      si_set_ring_buffer(sctx, SI_RING_GSVS, sctx->gsvs_ring, 0, sctx->gsvs_ring->width0, false,
+                         false, 0, 0, 0);
+   }
+
+   return true;
 }
 
 static void si_shader_lock(struct si_shader *shader)
 {
-       simple_mtx_lock(&shader->selector->mutex);
-       if (shader->previous_stage_sel) {
-               assert(shader->previous_stage_sel != shader->selector);
-               simple_mtx_lock(&shader->previous_stage_sel->mutex);
-       }
+   simple_mtx_lock(&shader->selector->mutex);
+   if (shader->previous_stage_sel) {
+      assert(shader->previous_stage_sel != shader->selector);
+      simple_mtx_lock(&shader->previous_stage_sel->mutex);
+   }
 }
 
 static void si_shader_unlock(struct si_shader *shader)
 {
-       if (shader->previous_stage_sel)
-               simple_mtx_unlock(&shader->previous_stage_sel->mutex);
-       simple_mtx_unlock(&shader->selector->mutex);
+   if (shader->previous_stage_sel)
+      simple_mtx_unlock(&shader->previous_stage_sel->mutex);
+   simple_mtx_unlock(&shader->selector->mutex);
 }
 
 /**
@@ -3705,578 +3429,545 @@ static void si_shader_unlock(struct si_shader *shader)
  *          0 if not
  *          < 0 if there was a failure
  */
-static int si_update_scratch_buffer(struct si_context *sctx,
-                                   struct si_shader *shader)
+static int si_update_scratch_buffer(struct si_context *sctx, struct si_shader *shader)
 {
-       uint64_t scratch_va = sctx->scratch_buffer->gpu_address;
+   uint64_t scratch_va = sctx->scratch_buffer->gpu_address;
 
-       if (!shader)
-               return 0;
+   if (!shader)
+      return 0;
 
-       /* This shader doesn't need a scratch buffer */
-       if (shader->config.scratch_bytes_per_wave == 0)
-               return 0;
+   /* This shader doesn't need a scratch buffer */
+   if (shader->config.scratch_bytes_per_wave == 0)
+      return 0;
 
-       /* Prevent race conditions when updating:
-        * - si_shader::scratch_bo
-        * - si_shader::binary::code
-        * - si_shader::previous_stage::binary::code.
-        */
-       si_shader_lock(shader);
+   /* Prevent race conditions when updating:
+    * - si_shader::scratch_bo
+    * - si_shader::binary::code
+    * - si_shader::previous_stage::binary::code.
+    */
+   si_shader_lock(shader);
 
-       /* This shader is already configured to use the current
-        * scratch buffer. */
-       if (shader->scratch_bo == sctx->scratch_buffer) {
-               si_shader_unlock(shader);
-               return 0;
-       }
+   /* This shader is already configured to use the current
+    * scratch buffer. */
+   if (shader->scratch_bo == sctx->scratch_buffer) {
+      si_shader_unlock(shader);
+      return 0;
+   }
 
-       assert(sctx->scratch_buffer);
+   assert(sctx->scratch_buffer);
 
-       /* Replace the shader bo with a new bo that has the relocs applied. */
-       if (!si_shader_binary_upload(sctx->screen, shader, scratch_va)) {
-               si_shader_unlock(shader);
-               return -1;
-       }
+   /* Replace the shader bo with a new bo that has the relocs applied. */
+   if (!si_shader_binary_upload(sctx->screen, shader, scratch_va)) {
+      si_shader_unlock(shader);
+      return -1;
+   }
 
-       /* Update the shader state to use the new shader bo. */
-       si_shader_init_pm4_state(sctx->screen, shader);
+   /* Update the shader state to use the new shader bo. */
+   si_shader_init_pm4_state(sctx->screen, shader);
 
-       si_resource_reference(&shader->scratch_bo, sctx->scratch_buffer);
+   si_resource_reference(&shader->scratch_bo, sctx->scratch_buffer);
 
-       si_shader_unlock(shader);
-       return 1;
+   si_shader_unlock(shader);
+   return 1;
 }
 
 static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader *shader)
 {
-       return shader ? shader->config.scratch_bytes_per_wave : 0;
+   return shader ? shader->config.scratch_bytes_per_wave : 0;
 }
 
 static struct si_shader *si_get_tcs_current(struct si_context *sctx)
 {
-       if (!sctx->tes_shader.cso)
-               return NULL; /* tessellation disabled */
+   if (!sctx->tes_shader.cso)
+      return NULL; /* tessellation disabled */
 
-       return sctx->tcs_shader.cso ? sctx->tcs_shader.current :
-                                     sctx->fixed_func_tcs_shader.current;
+   return sctx->tcs_shader.cso ? sctx->tcs_shader.current : sctx->fixed_func_tcs_shader.current;
 }
 
 static bool si_update_scratch_relocs(struct si_context *sctx)
 {
-       struct si_shader *tcs = si_get_tcs_current(sctx);
-       int r;
-
-       /* Update the shaders, so that they are using the latest scratch.
-        * The scratch buffer may have been changed since these shaders were
-        * last used, so we still need to try to update them, even if they
-        * require scratch buffers smaller than the current size.
-        */
-       r = si_update_scratch_buffer(sctx, sctx->ps_shader.current);
-       if (r < 0)
-               return false;
-       if (r == 1)
-               si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
-
-       r = si_update_scratch_buffer(sctx, sctx->gs_shader.current);
-       if (r < 0)
-               return false;
-       if (r == 1)
-               si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
-
-       r = si_update_scratch_buffer(sctx, tcs);
-       if (r < 0)
-               return false;
-       if (r == 1)
-               si_pm4_bind_state(sctx, hs, tcs->pm4);
-
-       /* VS can be bound as LS, ES, or VS. */
-       r = si_update_scratch_buffer(sctx, sctx->vs_shader.current);
-       if (r < 0)
-               return false;
-       if (r == 1) {
-               if (sctx->vs_shader.current->key.as_ls)
-                       si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4);
-               else if (sctx->vs_shader.current->key.as_es)
-                       si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4);
-               else if (sctx->vs_shader.current->key.as_ngg)
-                       si_pm4_bind_state(sctx, gs, sctx->vs_shader.current->pm4);
-               else
-                       si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4);
-       }
-
-       /* TES can be bound as ES or VS. */
-       r = si_update_scratch_buffer(sctx, sctx->tes_shader.current);
-       if (r < 0)
-               return false;
-       if (r == 1) {
-               if (sctx->tes_shader.current->key.as_es)
-                       si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4);
-               else if (sctx->tes_shader.current->key.as_ngg)
-                       si_pm4_bind_state(sctx, gs, sctx->tes_shader.current->pm4);
-               else
-                       si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4);
-       }
-
-       return true;
+   struct si_shader *tcs = si_get_tcs_current(sctx);
+   int r;
+
+   /* Update the shaders, so that they are using the latest scratch.
+    * The scratch buffer may have been changed since these shaders were
+    * last used, so we still need to try to update them, even if they
+    * require scratch buffers smaller than the current size.
+    */
+   r = si_update_scratch_buffer(sctx, sctx->ps_shader.current);
+   if (r < 0)
+      return false;
+   if (r == 1)
+      si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
+
+   r = si_update_scratch_buffer(sctx, sctx->gs_shader.current);
+   if (r < 0)
+      return false;
+   if (r == 1)
+      si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
+
+   r = si_update_scratch_buffer(sctx, tcs);
+   if (r < 0)
+      return false;
+   if (r == 1)
+      si_pm4_bind_state(sctx, hs, tcs->pm4);
+
+   /* VS can be bound as LS, ES, or VS. */
+   r = si_update_scratch_buffer(sctx, sctx->vs_shader.current);
+   if (r < 0)
+      return false;
+   if (r == 1) {
+      if (sctx->vs_shader.current->key.as_ls)
+         si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4);
+      else if (sctx->vs_shader.current->key.as_es)
+         si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4);
+      else if (sctx->vs_shader.current->key.as_ngg)
+         si_pm4_bind_state(sctx, gs, sctx->vs_shader.current->pm4);
+      else
+         si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4);
+   }
+
+   /* TES can be bound as ES or VS. */
+   r = si_update_scratch_buffer(sctx, sctx->tes_shader.current);
+   if (r < 0)
+      return false;
+   if (r == 1) {
+      if (sctx->tes_shader.current->key.as_es)
+         si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4);
+      else if (sctx->tes_shader.current->key.as_ngg)
+         si_pm4_bind_state(sctx, gs, sctx->tes_shader.current->pm4);
+      else
+         si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4);
+   }
+
+   return true;
 }
 
 static bool si_update_spi_tmpring_size(struct si_context *sctx)
 {
-       /* SPI_TMPRING_SIZE.WAVESIZE must be constant for each scratch buffer.
-        * There are 2 cases to handle:
-        *
-        * - If the current needed size is less than the maximum seen size,
-        *   use the maximum seen size, so that WAVESIZE remains the same.
-        *
-        * - If the current needed size is greater than the maximum seen size,
-        *   the scratch buffer is reallocated, so we can increase WAVESIZE.
-        *
-        * Shaders that set SCRATCH_EN=0 don't allocate scratch space.
-        * Otherwise, the number of waves that can use scratch is
-        * SPI_TMPRING_SIZE.WAVES.
-        */
-       unsigned bytes = 0;
-
-       bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader.current));
-       bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader.current));
-       bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader.current));
-
-       if (sctx->tes_shader.cso) {
-               bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader.current));
-               bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(si_get_tcs_current(sctx)));
-       }
-
-       sctx->max_seen_scratch_bytes_per_wave =
-               MAX2(sctx->max_seen_scratch_bytes_per_wave, bytes);
-
-       unsigned scratch_needed_size =
-               sctx->max_seen_scratch_bytes_per_wave * sctx->scratch_waves;
-       unsigned spi_tmpring_size;
-
-       if (scratch_needed_size > 0) {
-               if (!sctx->scratch_buffer ||
-                   scratch_needed_size > sctx->scratch_buffer->b.b.width0) {
-                       /* Create a bigger scratch buffer */
-                       si_resource_reference(&sctx->scratch_buffer, NULL);
-
-                       sctx->scratch_buffer =
-                               si_aligned_buffer_create(&sctx->screen->b,
-                                                        SI_RESOURCE_FLAG_UNMAPPABLE,
-                                                        PIPE_USAGE_DEFAULT,
-                                                        scratch_needed_size,
-                                                        sctx->screen->info.pte_fragment_size);
-                       if (!sctx->scratch_buffer)
-                               return false;
-
-                       si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state);
-                       si_context_add_resource_size(sctx,
-                                                    &sctx->scratch_buffer->b.b);
-               }
-
-               if (!si_update_scratch_relocs(sctx))
-                       return false;
-       }
-
-       /* The LLVM shader backend should be reporting aligned scratch_sizes. */
-       assert((scratch_needed_size & ~0x3FF) == scratch_needed_size &&
-               "scratch size should already be aligned correctly.");
-
-       spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) |
-                          S_0286E8_WAVESIZE(sctx->max_seen_scratch_bytes_per_wave >> 10);
-       if (spi_tmpring_size != sctx->spi_tmpring_size) {
-               sctx->spi_tmpring_size = spi_tmpring_size;
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state);
-       }
-       return true;
+   /* SPI_TMPRING_SIZE.WAVESIZE must be constant for each scratch buffer.
+    * There are 2 cases to handle:
+    *
+    * - If the current needed size is less than the maximum seen size,
+    *   use the maximum seen size, so that WAVESIZE remains the same.
+    *
+    * - If the current needed size is greater than the maximum seen size,
+    *   the scratch buffer is reallocated, so we can increase WAVESIZE.
+    *
+    * Shaders that set SCRATCH_EN=0 don't allocate scratch space.
+    * Otherwise, the number of waves that can use scratch is
+    * SPI_TMPRING_SIZE.WAVES.
+    */
+   unsigned bytes = 0;
+
+   bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader.current));
+   bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader.current));
+   bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader.current));
+
+   if (sctx->tes_shader.cso) {
+      bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader.current));
+      bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(si_get_tcs_current(sctx)));
+   }
+
+   sctx->max_seen_scratch_bytes_per_wave = MAX2(sctx->max_seen_scratch_bytes_per_wave, bytes);
+
+   unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave * sctx->scratch_waves;
+   unsigned spi_tmpring_size;
+
+   if (scratch_needed_size > 0) {
+      if (!sctx->scratch_buffer || scratch_needed_size > sctx->scratch_buffer->b.b.width0) {
+         /* Create a bigger scratch buffer */
+         si_resource_reference(&sctx->scratch_buffer, NULL);
+
+         sctx->scratch_buffer = si_aligned_buffer_create(
+            &sctx->screen->b, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, scratch_needed_size,
+            sctx->screen->info.pte_fragment_size);
+         if (!sctx->scratch_buffer)
+            return false;
+
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state);
+         si_context_add_resource_size(sctx, &sctx->scratch_buffer->b.b);
+      }
+
+      if (!si_update_scratch_relocs(sctx))
+         return false;
+   }
+
+   /* The LLVM shader backend should be reporting aligned scratch_sizes. */
+   assert((scratch_needed_size & ~0x3FF) == scratch_needed_size &&
+          "scratch size should already be aligned correctly.");
+
+   spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) |
+                      S_0286E8_WAVESIZE(sctx->max_seen_scratch_bytes_per_wave >> 10);
+   if (spi_tmpring_size != sctx->spi_tmpring_size) {
+      sctx->spi_tmpring_size = spi_tmpring_size;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state);
+   }
+   return true;
 }
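A minimal standalone sketch of the SPI_TMPRING_SIZE computation above, with the register's bitfield layout assumed purely for illustration (the real S_0286E8_* macros come from the generated register headers): the per-wave scratch size only ever grows, so WAVESIZE stays stable for a given scratch allocation, and the size is expressed in 1 KiB units (hence the >> 10).

#include <stdint.h>
#include <stdio.h>

/* Assumed field layout, for illustration only. */
#define WAVES(x)    ((uint32_t)(x) & 0xFFF)
#define WAVESIZE(x) (((uint32_t)(x) & 0x1FFF) << 12)

static uint32_t pack_spi_tmpring_size(unsigned max_scratch_bytes_per_wave, unsigned scratch_waves)
{
   /* max_scratch_bytes_per_wave is the running maximum across all bound
    * shaders, mirroring sctx->max_seen_scratch_bytes_per_wave above. */
   return WAVES(scratch_waves) | WAVESIZE(max_scratch_bytes_per_wave >> 10);
}

int main(void)
{
   printf("0x%08x\n", pack_spi_tmpring_size(8192, 32)); /* 8 KiB per wave, 32 waves */
   return 0;
}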
 
 static void si_init_tess_factor_ring(struct si_context *sctx)
 {
-       assert(!sctx->tess_rings);
-       assert(((sctx->screen->tess_factor_ring_size / 4) & C_030938_SIZE) == 0);
-
-       /* The address must be aligned to 2^19, because the shader only
-        * receives the high 13 bits.
-        */
-       sctx->tess_rings = pipe_aligned_buffer_create(sctx->b.screen,
-                                                   SI_RESOURCE_FLAG_32BIT,
-                                                   PIPE_USAGE_DEFAULT,
-                                                   sctx->screen->tess_offchip_ring_size +
-                                                   sctx->screen->tess_factor_ring_size,
-                                                   1 << 19);
-       if (!sctx->tess_rings)
-               return;
-
-       si_init_config_add_vgt_flush(sctx);
-
-       si_pm4_add_bo(sctx->init_config, si_resource(sctx->tess_rings),
-                     RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RINGS);
-
-       uint64_t factor_va = si_resource(sctx->tess_rings)->gpu_address +
-                            sctx->screen->tess_offchip_ring_size;
-
-       /* Append these registers to the init config state. */
-       if (sctx->chip_class >= GFX7) {
-               si_pm4_set_reg(sctx->init_config, R_030938_VGT_TF_RING_SIZE,
-                              S_030938_SIZE(sctx->screen->tess_factor_ring_size / 4));
-               si_pm4_set_reg(sctx->init_config, R_030940_VGT_TF_MEMORY_BASE,
-                              factor_va >> 8);
-               if (sctx->chip_class >= GFX10)
-                       si_pm4_set_reg(sctx->init_config, R_030984_VGT_TF_MEMORY_BASE_HI_UMD,
-                                      S_030984_BASE_HI(factor_va >> 40));
-               else if (sctx->chip_class == GFX9)
-                       si_pm4_set_reg(sctx->init_config, R_030944_VGT_TF_MEMORY_BASE_HI,
-                                      S_030944_BASE_HI(factor_va >> 40));
-               si_pm4_set_reg(sctx->init_config, R_03093C_VGT_HS_OFFCHIP_PARAM,
-                              sctx->screen->vgt_hs_offchip_param);
-       } else {
-               si_pm4_set_reg(sctx->init_config, R_008988_VGT_TF_RING_SIZE,
-                              S_008988_SIZE(sctx->screen->tess_factor_ring_size / 4));
-               si_pm4_set_reg(sctx->init_config, R_0089B8_VGT_TF_MEMORY_BASE,
-                              factor_va >> 8);
-               si_pm4_set_reg(sctx->init_config, R_0089B0_VGT_HS_OFFCHIP_PARAM,
-                              sctx->screen->vgt_hs_offchip_param);
-       }
-
-       /* Flush the context to re-emit the init_config state.
-        * This is done only once in a lifetime of a context.
-        */
-       si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
-       sctx->initial_gfx_cs_size = 0; /* force flush */
-       si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+   assert(!sctx->tess_rings);
+   assert(((sctx->screen->tess_factor_ring_size / 4) & C_030938_SIZE) == 0);
+
+   /* The address must be aligned to 2^19, because the shader only
+    * receives the high 13 bits.
+    */
+   sctx->tess_rings = pipe_aligned_buffer_create(
+      sctx->b.screen, SI_RESOURCE_FLAG_32BIT, PIPE_USAGE_DEFAULT,
+      sctx->screen->tess_offchip_ring_size + sctx->screen->tess_factor_ring_size, 1 << 19);
+   if (!sctx->tess_rings)
+      return;
+
+   si_init_config_add_vgt_flush(sctx);
+
+   si_pm4_add_bo(sctx->init_config, si_resource(sctx->tess_rings), RADEON_USAGE_READWRITE,
+                 RADEON_PRIO_SHADER_RINGS);
+
+   uint64_t factor_va =
+      si_resource(sctx->tess_rings)->gpu_address + sctx->screen->tess_offchip_ring_size;
+
+   /* Append these registers to the init config state. */
+   if (sctx->chip_class >= GFX7) {
+      si_pm4_set_reg(sctx->init_config, R_030938_VGT_TF_RING_SIZE,
+                     S_030938_SIZE(sctx->screen->tess_factor_ring_size / 4));
+      si_pm4_set_reg(sctx->init_config, R_030940_VGT_TF_MEMORY_BASE, factor_va >> 8);
+      if (sctx->chip_class >= GFX10)
+         si_pm4_set_reg(sctx->init_config, R_030984_VGT_TF_MEMORY_BASE_HI_UMD,
+                        S_030984_BASE_HI(factor_va >> 40));
+      else if (sctx->chip_class == GFX9)
+         si_pm4_set_reg(sctx->init_config, R_030944_VGT_TF_MEMORY_BASE_HI,
+                        S_030944_BASE_HI(factor_va >> 40));
+      si_pm4_set_reg(sctx->init_config, R_03093C_VGT_HS_OFFCHIP_PARAM,
+                     sctx->screen->vgt_hs_offchip_param);
+   } else {
+      si_pm4_set_reg(sctx->init_config, R_008988_VGT_TF_RING_SIZE,
+                     S_008988_SIZE(sctx->screen->tess_factor_ring_size / 4));
+      si_pm4_set_reg(sctx->init_config, R_0089B8_VGT_TF_MEMORY_BASE, factor_va >> 8);
+      si_pm4_set_reg(sctx->init_config, R_0089B0_VGT_HS_OFFCHIP_PARAM,
+                     sctx->screen->vgt_hs_offchip_param);
+   }
+
+   /* Flush the context to re-emit the init_config state.
+    * This is done only once in the lifetime of a context.
+    */
+   si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
+   sctx->initial_gfx_cs_size = 0; /* force flush */
+   si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 }
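The 2^19 alignment above exists because the hardware only sees the upper bits of the ring address: VGT_TF_MEMORY_BASE is programmed with the address shifted right by 8, and GFX9+ adds a separate HI field holding the bits above 40. A hedged sketch of that split, using plain shifts instead of the real register macros:

#include <stdint.h>

/* Illustrative only: split a GPU virtual address the way the code above
 * programs VGT_TF_MEMORY_BASE (va >> 8) and the *_BASE_HI field (va >> 40).
 * The low bits are assumed to be zero thanks to the buffer alignment. */
void split_tf_ring_address(uint64_t va, uint32_t *base, uint32_t *base_hi)
{
   *base = (uint32_t)(va >> 8);
   *base_hi = (uint32_t)(va >> 40);
}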
 
 static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen,
-                                                      union si_vgt_stages_key key)
+                                                       union si_vgt_stages_key key)
 {
-       struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
-       uint32_t stages = 0;
-
-       if (key.u.tess) {
-               stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) |
-                         S_028B54_HS_EN(1) | S_028B54_DYNAMIC_HS(1);
-
-               if (key.u.gs)
-                       stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) |
-                                 S_028B54_GS_EN(1);
-               else if (key.u.ngg)
-                       stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS);
-               else
-                       stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
-       } else if (key.u.gs) {
-               stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) |
-                         S_028B54_GS_EN(1);
-       } else if (key.u.ngg) {
-               stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL);
-       }
-
-       if (key.u.ngg) {
-               stages |= S_028B54_PRIMGEN_EN(1) |
-                         S_028B54_GS_FAST_LAUNCH(key.u.ngg_gs_fast_launch) |
-                         S_028B54_NGG_WAVE_ID_EN(key.u.streamout) |
-                         S_028B54_PRIMGEN_PASSTHRU_EN(key.u.ngg_passthrough);
-       } else if (key.u.gs)
-               stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
-
-       if (screen->info.chip_class >= GFX9)
-               stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
-
-       if (screen->info.chip_class >= GFX10 && screen->ge_wave_size == 32) {
-               stages |= S_028B54_HS_W32_EN(1) |
-                         S_028B54_GS_W32_EN(key.u.ngg) | /* legacy GS only supports Wave64 */
-                         S_028B54_VS_W32_EN(1);
-       }
-
-       si_pm4_set_reg(pm4, R_028B54_VGT_SHADER_STAGES_EN, stages);
-       return pm4;
+   struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
+   uint32_t stages = 0;
+
+   if (key.u.tess) {
+      stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) | S_028B54_HS_EN(1) | S_028B54_DYNAMIC_HS(1);
+
+      if (key.u.gs)
+         stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) | S_028B54_GS_EN(1);
+      else if (key.u.ngg)
+         stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS);
+      else
+         stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
+   } else if (key.u.gs) {
+      stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) | S_028B54_GS_EN(1);
+   } else if (key.u.ngg) {
+      stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL);
+   }
+
+   if (key.u.ngg) {
+      stages |= S_028B54_PRIMGEN_EN(1) | S_028B54_GS_FAST_LAUNCH(key.u.ngg_gs_fast_launch) |
+                S_028B54_NGG_WAVE_ID_EN(key.u.streamout) |
+                S_028B54_PRIMGEN_PASSTHRU_EN(key.u.ngg_passthrough);
+   } else if (key.u.gs)
+      stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
+
+   if (screen->info.chip_class >= GFX9)
+      stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
+
+   if (screen->info.chip_class >= GFX10 && screen->ge_wave_size == 32) {
+      stages |= S_028B54_HS_W32_EN(1) |
+                S_028B54_GS_W32_EN(key.u.ngg) | /* legacy GS only supports Wave64 */
+                S_028B54_VS_W32_EN(1);
+   }
+
+   si_pm4_set_reg(pm4, R_028B54_VGT_SHADER_STAGES_EN, stages);
+   return pm4;
 }
 
-static void si_update_vgt_shader_config(struct si_context *sctx,
-                                       union si_vgt_stages_key key)
+static void si_update_vgt_shader_config(struct si_context *sctx, union si_vgt_stages_key key)
 {
-       struct si_pm4_state **pm4 = &sctx->vgt_shader_config[key.index];
+   struct si_pm4_state **pm4 = &sctx->vgt_shader_config[key.index];
 
-       if (unlikely(!*pm4))
-               *pm4 = si_build_vgt_shader_config(sctx->screen, key);
-       si_pm4_bind_state(sctx, vgt_shader_config, *pm4);
+   if (unlikely(!*pm4))
+      *pm4 = si_build_vgt_shader_config(sctx->screen, key);
+   si_pm4_bind_state(sctx, vgt_shader_config, *pm4);
 }
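si_update_vgt_shader_config builds each VGT_SHADER_STAGES_EN configuration lazily and keeps it in an array indexed by key.index, so a given stage combination (tess/GS/NGG/streamout and so on) is translated into a PM4 state at most once per context. A small sketch of the same lazy, key-indexed cache pattern, with placeholder types that are not radeonsi API:

#include <stddef.h>

struct cfg_state;                                /* placeholder for the cached state */
struct cfg_state *build_cfg_state(unsigned key); /* placeholder builder */

#define NUM_CFG_KEYS 32 /* assumed size of the key space */

static struct cfg_state *cfg_cache[NUM_CFG_KEYS];

struct cfg_state *get_cfg_state(unsigned key)
{
   if (!cfg_cache[key]) /* built at most once per key */
      cfg_cache[key] = build_cfg_state(key);
   return cfg_cache[key];
}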
 
 bool si_update_shaders(struct si_context *sctx)
 {
-       struct pipe_context *ctx = (struct pipe_context*)sctx;
-       struct si_compiler_ctx_state compiler_state;
-       struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
-       struct si_shader *old_vs = si_get_vs_state(sctx);
-       bool old_clip_disable = old_vs ? old_vs->key.opt.clip_disable : false;
-       struct si_shader *old_ps = sctx->ps_shader.current;
-       union si_vgt_stages_key key;
-       unsigned old_spi_shader_col_format =
-               old_ps ? old_ps->key.part.ps.epilog.spi_shader_col_format : 0;
-       int r;
-
-       if (!sctx->compiler.passes)
-               si_init_compiler(sctx->screen, &sctx->compiler);
-
-       compiler_state.compiler = &sctx->compiler;
-       compiler_state.debug = sctx->debug;
-       compiler_state.is_debug_context = sctx->is_debug;
-
-       key.index = 0;
-
-       if (sctx->tes_shader.cso)
-               key.u.tess = 1;
-       if (sctx->gs_shader.cso)
-               key.u.gs = 1;
-
-       if (sctx->ngg) {
-               key.u.ngg = 1;
-               key.u.streamout = !!si_get_vs(sctx)->cso->so.num_outputs;
-       }
-
-       /* Update TCS and TES. */
-       if (sctx->tes_shader.cso) {
-               if (!sctx->tess_rings) {
-                       si_init_tess_factor_ring(sctx);
-                       if (!sctx->tess_rings)
-                               return false;
-               }
-
-               if (sctx->tcs_shader.cso) {
-                       r = si_shader_select(ctx, &sctx->tcs_shader, key,
-                                            &compiler_state);
-                       if (r)
-                               return false;
-                       si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4);
-               } else {
-                       if (!sctx->fixed_func_tcs_shader.cso) {
-                               sctx->fixed_func_tcs_shader.cso =
-                                       si_create_fixed_func_tcs(sctx);
-                               if (!sctx->fixed_func_tcs_shader.cso)
-                                       return false;
-                       }
-
-                       r = si_shader_select(ctx, &sctx->fixed_func_tcs_shader,
-                                            key, &compiler_state);
-                       if (r)
-                               return false;
-                       si_pm4_bind_state(sctx, hs,
-                                         sctx->fixed_func_tcs_shader.current->pm4);
-               }
-
-               if (!sctx->gs_shader.cso || sctx->chip_class <= GFX8) {
-                       r = si_shader_select(ctx, &sctx->tes_shader, key, &compiler_state);
-                       if (r)
-                               return false;
-
-                       if (sctx->gs_shader.cso) {
-                               /* TES as ES */
-                               assert(sctx->chip_class <= GFX8);
-                               si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4);
-                       } else if (key.u.ngg) {
-                               si_pm4_bind_state(sctx, gs, sctx->tes_shader.current->pm4);
-                       } else {
-                               si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4);
-                       }
-               }
-       } else {
-               if (sctx->chip_class <= GFX8)
-                       si_pm4_bind_state(sctx, ls, NULL);
-               si_pm4_bind_state(sctx, hs, NULL);
-       }
-
-       /* Update GS. */
-       if (sctx->gs_shader.cso) {
-               r = si_shader_select(ctx, &sctx->gs_shader, key, &compiler_state);
-               if (r)
-                       return false;
-               si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
-               if (!key.u.ngg) {
-                       si_pm4_bind_state(sctx, vs, sctx->gs_shader.cso->gs_copy_shader->pm4);
-
-                       if (!si_update_gs_ring_buffers(sctx))
-                               return false;
-               } else {
-                       si_pm4_bind_state(sctx, vs, NULL);
-               }
-       } else {
-               if (!key.u.ngg) {
-                       si_pm4_bind_state(sctx, gs, NULL);
-                       if (sctx->chip_class <= GFX8)
-                               si_pm4_bind_state(sctx, es, NULL);
-               }
-       }
-
-       /* Update VS. */
-       if ((!key.u.tess && !key.u.gs) || sctx->chip_class <= GFX8) {
-               r = si_shader_select(ctx, &sctx->vs_shader, key, &compiler_state);
-               if (r)
-                       return false;
-
-               if (!key.u.tess && !key.u.gs) {
-                       if (key.u.ngg) {
-                               si_pm4_bind_state(sctx, gs, sctx->vs_shader.current->pm4);
-                               si_pm4_bind_state(sctx, vs, NULL);
-                       } else {
-                               si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4);
-                       }
-               } else if (sctx->tes_shader.cso) {
-                       si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4);
-               } else {
-                       assert(sctx->gs_shader.cso);
-                       si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4);
-               }
-       }
-
-       /* This must be done after the shader variant is selected. */
-       if (sctx->ngg) {
-               struct si_shader *vs = si_get_vs(sctx)->current;
-
-               key.u.ngg_passthrough = gfx10_is_ngg_passthrough(vs);
-               key.u.ngg_gs_fast_launch = !!(vs->key.opt.ngg_culling &
-                                             SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
-       }
-
-       si_update_vgt_shader_config(sctx, key);
-
-       if (old_clip_disable != si_get_vs_state(sctx)->key.opt.clip_disable)
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
-
-       if (sctx->ps_shader.cso) {
-               unsigned db_shader_control;
-
-               r = si_shader_select(ctx, &sctx->ps_shader, key, &compiler_state);
-               if (r)
-                       return false;
-               si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
-
-               db_shader_control =
-                       sctx->ps_shader.cso->db_shader_control |
-                       S_02880C_KILL_ENABLE(si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS);
-
-               if (si_pm4_state_changed(sctx, ps) ||
-                   si_pm4_state_changed(sctx, vs) ||
-                   (key.u.ngg && si_pm4_state_changed(sctx, gs)) ||
-                   sctx->sprite_coord_enable != rs->sprite_coord_enable ||
-                   sctx->flatshade != rs->flatshade) {
-                       sctx->sprite_coord_enable = rs->sprite_coord_enable;
-                       sctx->flatshade = rs->flatshade;
-                       si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map);
-               }
-
-               if (sctx->screen->info.rbplus_allowed &&
-                   si_pm4_state_changed(sctx, ps) &&
-                   (!old_ps ||
-                    old_spi_shader_col_format !=
-                    sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format))
-                       si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
-
-               if (sctx->ps_db_shader_control != db_shader_control) {
-                       sctx->ps_db_shader_control = db_shader_control;
-                       si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
-                       if (sctx->screen->dpbb_allowed)
-                               si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
-               }
-
-               if (sctx->smoothing_enabled != sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing) {
-                       sctx->smoothing_enabled = sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing;
-                       si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
-
-                       if (sctx->chip_class == GFX6)
-                               si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
-
-                       if (sctx->framebuffer.nr_samples <= 1)
-                               si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
-               }
-       }
-
-       if (si_pm4_state_enabled_and_changed(sctx, ls) ||
-           si_pm4_state_enabled_and_changed(sctx, hs) ||
-           si_pm4_state_enabled_and_changed(sctx, es) ||
-           si_pm4_state_enabled_and_changed(sctx, gs) ||
-           si_pm4_state_enabled_and_changed(sctx, vs) ||
-           si_pm4_state_enabled_and_changed(sctx, ps)) {
-               if (!si_update_spi_tmpring_size(sctx))
-                       return false;
-       }
-
-       if (sctx->chip_class >= GFX7) {
-               if (si_pm4_state_enabled_and_changed(sctx, ls))
-                       sctx->prefetch_L2_mask |= SI_PREFETCH_LS;
-               else if (!sctx->queued.named.ls)
-                       sctx->prefetch_L2_mask &= ~SI_PREFETCH_LS;
-
-               if (si_pm4_state_enabled_and_changed(sctx, hs))
-                       sctx->prefetch_L2_mask |= SI_PREFETCH_HS;
-               else if (!sctx->queued.named.hs)
-                       sctx->prefetch_L2_mask &= ~SI_PREFETCH_HS;
-
-               if (si_pm4_state_enabled_and_changed(sctx, es))
-                       sctx->prefetch_L2_mask |= SI_PREFETCH_ES;
-               else if (!sctx->queued.named.es)
-                       sctx->prefetch_L2_mask &= ~SI_PREFETCH_ES;
-
-               if (si_pm4_state_enabled_and_changed(sctx, gs))
-                       sctx->prefetch_L2_mask |= SI_PREFETCH_GS;
-               else if (!sctx->queued.named.gs)
-                       sctx->prefetch_L2_mask &= ~SI_PREFETCH_GS;
-
-               if (si_pm4_state_enabled_and_changed(sctx, vs))
-                       sctx->prefetch_L2_mask |= SI_PREFETCH_VS;
-               else if (!sctx->queued.named.vs)
-                       sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS;
-
-               if (si_pm4_state_enabled_and_changed(sctx, ps))
-                       sctx->prefetch_L2_mask |= SI_PREFETCH_PS;
-               else if (!sctx->queued.named.ps)
-                       sctx->prefetch_L2_mask &= ~SI_PREFETCH_PS;
-       }
-
-       sctx->do_update_shaders = false;
-       return true;
+   struct pipe_context *ctx = (struct pipe_context *)sctx;
+   struct si_compiler_ctx_state compiler_state;
+   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+   struct si_shader *old_vs = si_get_vs_state(sctx);
+   bool old_clip_disable = old_vs ? old_vs->key.opt.clip_disable : false;
+   struct si_shader *old_ps = sctx->ps_shader.current;
+   union si_vgt_stages_key key;
+   unsigned old_spi_shader_col_format =
+      old_ps ? old_ps->key.part.ps.epilog.spi_shader_col_format : 0;
+   int r;
+
+   if (!sctx->compiler.passes)
+      si_init_compiler(sctx->screen, &sctx->compiler);
+
+   compiler_state.compiler = &sctx->compiler;
+   compiler_state.debug = sctx->debug;
+   compiler_state.is_debug_context = sctx->is_debug;
+
+   key.index = 0;
+
+   if (sctx->tes_shader.cso)
+      key.u.tess = 1;
+   if (sctx->gs_shader.cso)
+      key.u.gs = 1;
+
+   if (sctx->ngg) {
+      key.u.ngg = 1;
+      key.u.streamout = !!si_get_vs(sctx)->cso->so.num_outputs;
+   }
+
+   /* Update TCS and TES. */
+   if (sctx->tes_shader.cso) {
+      if (!sctx->tess_rings) {
+         si_init_tess_factor_ring(sctx);
+         if (!sctx->tess_rings)
+            return false;
+      }
+
+      if (sctx->tcs_shader.cso) {
+         r = si_shader_select(ctx, &sctx->tcs_shader, key, &compiler_state);
+         if (r)
+            return false;
+         si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4);
+      } else {
+         if (!sctx->fixed_func_tcs_shader.cso) {
+            sctx->fixed_func_tcs_shader.cso = si_create_fixed_func_tcs(sctx);
+            if (!sctx->fixed_func_tcs_shader.cso)
+               return false;
+         }
+
+         r = si_shader_select(ctx, &sctx->fixed_func_tcs_shader, key, &compiler_state);
+         if (r)
+            return false;
+         si_pm4_bind_state(sctx, hs, sctx->fixed_func_tcs_shader.current->pm4);
+      }
+
+      if (!sctx->gs_shader.cso || sctx->chip_class <= GFX8) {
+         r = si_shader_select(ctx, &sctx->tes_shader, key, &compiler_state);
+         if (r)
+            return false;
+
+         if (sctx->gs_shader.cso) {
+            /* TES as ES */
+            assert(sctx->chip_class <= GFX8);
+            si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4);
+         } else if (key.u.ngg) {
+            si_pm4_bind_state(sctx, gs, sctx->tes_shader.current->pm4);
+         } else {
+            si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4);
+         }
+      }
+   } else {
+      if (sctx->chip_class <= GFX8)
+         si_pm4_bind_state(sctx, ls, NULL);
+      si_pm4_bind_state(sctx, hs, NULL);
+   }
+
+   /* Update GS. */
+   if (sctx->gs_shader.cso) {
+      r = si_shader_select(ctx, &sctx->gs_shader, key, &compiler_state);
+      if (r)
+         return false;
+      si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
+      if (!key.u.ngg) {
+         si_pm4_bind_state(sctx, vs, sctx->gs_shader.cso->gs_copy_shader->pm4);
+
+         if (!si_update_gs_ring_buffers(sctx))
+            return false;
+      } else {
+         si_pm4_bind_state(sctx, vs, NULL);
+      }
+   } else {
+      if (!key.u.ngg) {
+         si_pm4_bind_state(sctx, gs, NULL);
+         if (sctx->chip_class <= GFX8)
+            si_pm4_bind_state(sctx, es, NULL);
+      }
+   }
+
+   /* Update VS. */
+   if ((!key.u.tess && !key.u.gs) || sctx->chip_class <= GFX8) {
+      r = si_shader_select(ctx, &sctx->vs_shader, key, &compiler_state);
+      if (r)
+         return false;
+
+      if (!key.u.tess && !key.u.gs) {
+         if (key.u.ngg) {
+            si_pm4_bind_state(sctx, gs, sctx->vs_shader.current->pm4);
+            si_pm4_bind_state(sctx, vs, NULL);
+         } else {
+            si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4);
+         }
+      } else if (sctx->tes_shader.cso) {
+         si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4);
+      } else {
+         assert(sctx->gs_shader.cso);
+         si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4);
+      }
+   }
+
+   /* This must be done after the shader variant is selected. */
+   if (sctx->ngg) {
+      struct si_shader *vs = si_get_vs(sctx)->current;
+
+      key.u.ngg_passthrough = gfx10_is_ngg_passthrough(vs);
+      key.u.ngg_gs_fast_launch = !!(vs->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
+   }
+
+   si_update_vgt_shader_config(sctx, key);
+
+   if (old_clip_disable != si_get_vs_state(sctx)->key.opt.clip_disable)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
+
+   if (sctx->ps_shader.cso) {
+      unsigned db_shader_control;
+
+      r = si_shader_select(ctx, &sctx->ps_shader, key, &compiler_state);
+      if (r)
+         return false;
+      si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
+
+      db_shader_control = sctx->ps_shader.cso->db_shader_control |
+                          S_02880C_KILL_ENABLE(si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS);
+
+      if (si_pm4_state_changed(sctx, ps) || si_pm4_state_changed(sctx, vs) ||
+          (key.u.ngg && si_pm4_state_changed(sctx, gs)) ||
+          sctx->sprite_coord_enable != rs->sprite_coord_enable ||
+          sctx->flatshade != rs->flatshade) {
+         sctx->sprite_coord_enable = rs->sprite_coord_enable;
+         sctx->flatshade = rs->flatshade;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map);
+      }
+
+      if (sctx->screen->info.rbplus_allowed && si_pm4_state_changed(sctx, ps) &&
+          (!old_ps || old_spi_shader_col_format !=
+                         sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format))
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
+
+      if (sctx->ps_db_shader_control != db_shader_control) {
+         sctx->ps_db_shader_control = db_shader_control;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+         if (sctx->screen->dpbb_allowed)
+            si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
+      }
+
+      if (sctx->smoothing_enabled !=
+          sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing) {
+         sctx->smoothing_enabled = sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+
+         if (sctx->chip_class == GFX6)
+            si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+
+         if (sctx->framebuffer.nr_samples <= 1)
+            si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
+      }
+   }
+
+   if (si_pm4_state_enabled_and_changed(sctx, ls) || si_pm4_state_enabled_and_changed(sctx, hs) ||
+       si_pm4_state_enabled_and_changed(sctx, es) || si_pm4_state_enabled_and_changed(sctx, gs) ||
+       si_pm4_state_enabled_and_changed(sctx, vs) || si_pm4_state_enabled_and_changed(sctx, ps)) {
+      if (!si_update_spi_tmpring_size(sctx))
+         return false;
+   }
+
+   if (sctx->chip_class >= GFX7) {
+      if (si_pm4_state_enabled_and_changed(sctx, ls))
+         sctx->prefetch_L2_mask |= SI_PREFETCH_LS;
+      else if (!sctx->queued.named.ls)
+         sctx->prefetch_L2_mask &= ~SI_PREFETCH_LS;
+
+      if (si_pm4_state_enabled_and_changed(sctx, hs))
+         sctx->prefetch_L2_mask |= SI_PREFETCH_HS;
+      else if (!sctx->queued.named.hs)
+         sctx->prefetch_L2_mask &= ~SI_PREFETCH_HS;
+
+      if (si_pm4_state_enabled_and_changed(sctx, es))
+         sctx->prefetch_L2_mask |= SI_PREFETCH_ES;
+      else if (!sctx->queued.named.es)
+         sctx->prefetch_L2_mask &= ~SI_PREFETCH_ES;
+
+      if (si_pm4_state_enabled_and_changed(sctx, gs))
+         sctx->prefetch_L2_mask |= SI_PREFETCH_GS;
+      else if (!sctx->queued.named.gs)
+         sctx->prefetch_L2_mask &= ~SI_PREFETCH_GS;
+
+      if (si_pm4_state_enabled_and_changed(sctx, vs))
+         sctx->prefetch_L2_mask |= SI_PREFETCH_VS;
+      else if (!sctx->queued.named.vs)
+         sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS;
+
+      if (si_pm4_state_enabled_and_changed(sctx, ps))
+         sctx->prefetch_L2_mask |= SI_PREFETCH_PS;
+      else if (!sctx->queued.named.ps)
+         sctx->prefetch_L2_mask &= ~SI_PREFETCH_PS;
+   }
+
+   sctx->do_update_shaders = false;
+   return true;
 }
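The prefetch bookkeeping at the end of si_update_shaders follows one pattern per stage: set the stage's bit when its PM4 state is bound and just changed (so it gets prefetched into L2 before the next draw), and clear it once nothing is bound for that stage. A hedged sketch of that pattern with illustrative names, not the SI_PREFETCH_* definitions:

#include <stdbool.h>

/* Illustrative bit assignments, not the real SI_PREFETCH_* values. */
#define PREFETCH_VS 0x1u
#define PREFETCH_PS 0x2u

void update_prefetch_bit(unsigned *prefetch_mask, unsigned stage_bit,
                         bool enabled_and_changed, bool still_bound)
{
   if (enabled_and_changed)
      *prefetch_mask |= stage_bit;  /* new shader bound: prefetch it */
   else if (!still_bound)
      *prefetch_mask &= ~stage_bit; /* stage unbound: nothing to prefetch */
}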
 
 static void si_emit_scratch_state(struct si_context *sctx)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
-       radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE,
-                              sctx->spi_tmpring_size);
+   radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE, sctx->spi_tmpring_size);
 
-       if (sctx->scratch_buffer) {
-               radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-                                     sctx->scratch_buffer, RADEON_USAGE_READWRITE,
-                                     RADEON_PRIO_SCRATCH_BUFFER);
-       }
+   if (sctx->scratch_buffer) {
+      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, sctx->scratch_buffer, RADEON_USAGE_READWRITE,
+                                RADEON_PRIO_SCRATCH_BUFFER);
+   }
 }
 
 void si_init_screen_live_shader_cache(struct si_screen *sscreen)
 {
-       util_live_shader_cache_init(&sscreen->live_shader_cache,
-                                   si_create_shader_selector,
-                                   si_destroy_shader_selector);
+   util_live_shader_cache_init(&sscreen->live_shader_cache, si_create_shader_selector,
+                               si_destroy_shader_selector);
 }
 
 void si_init_shader_functions(struct si_context *sctx)
 {
-       sctx->atoms.s.spi_map.emit = si_emit_spi_map;
-       sctx->atoms.s.scratch_state.emit = si_emit_scratch_state;
-
-       sctx->b.create_vs_state = si_create_shader;
-       sctx->b.create_tcs_state = si_create_shader;
-       sctx->b.create_tes_state = si_create_shader;
-       sctx->b.create_gs_state = si_create_shader;
-       sctx->b.create_fs_state = si_create_shader;
-
-       sctx->b.bind_vs_state = si_bind_vs_shader;
-       sctx->b.bind_tcs_state = si_bind_tcs_shader;
-       sctx->b.bind_tes_state = si_bind_tes_shader;
-       sctx->b.bind_gs_state = si_bind_gs_shader;
-       sctx->b.bind_fs_state = si_bind_ps_shader;
-
-       sctx->b.delete_vs_state = si_delete_shader_selector;
-       sctx->b.delete_tcs_state = si_delete_shader_selector;
-       sctx->b.delete_tes_state = si_delete_shader_selector;
-       sctx->b.delete_gs_state = si_delete_shader_selector;
-       sctx->b.delete_fs_state = si_delete_shader_selector;
+   sctx->atoms.s.spi_map.emit = si_emit_spi_map;
+   sctx->atoms.s.scratch_state.emit = si_emit_scratch_state;
+
+   sctx->b.create_vs_state = si_create_shader;
+   sctx->b.create_tcs_state = si_create_shader;
+   sctx->b.create_tes_state = si_create_shader;
+   sctx->b.create_gs_state = si_create_shader;
+   sctx->b.create_fs_state = si_create_shader;
+
+   sctx->b.bind_vs_state = si_bind_vs_shader;
+   sctx->b.bind_tcs_state = si_bind_tcs_shader;
+   sctx->b.bind_tes_state = si_bind_tes_shader;
+   sctx->b.bind_gs_state = si_bind_gs_shader;
+   sctx->b.bind_fs_state = si_bind_ps_shader;
+
+   sctx->b.delete_vs_state = si_delete_shader_selector;
+   sctx->b.delete_tcs_state = si_delete_shader_selector;
+   sctx->b.delete_tes_state = si_delete_shader_selector;
+   sctx->b.delete_gs_state = si_delete_shader_selector;
+   sctx->b.delete_fs_state = si_delete_shader_selector;
 }
index 85ac4a119c50eb62d5f83cb397859095117a43d9..2ce8de0ccdea7b3fa7f68ac729c379fbd25743d6 100644 (file)
  */
 
 #include "si_build_pm4.h"
-
 #include "util/u_memory.h"
 #include "util/u_suballoc.h"
 
 static void si_set_streamout_enable(struct si_context *sctx, bool enable);
 
 static inline void si_so_target_reference(struct si_streamout_target **dst,
-                                         struct pipe_stream_output_target *src)
+                                          struct pipe_stream_output_target *src)
 {
-       pipe_so_target_reference((struct pipe_stream_output_target**)dst, src);
+   pipe_so_target_reference((struct pipe_stream_output_target **)dst, src);
 }
 
-static struct pipe_stream_output_target *
-si_create_so_target(struct pipe_context *ctx,
-                   struct pipe_resource *buffer,
-                   unsigned buffer_offset,
-                   unsigned buffer_size)
+static struct pipe_stream_output_target *si_create_so_target(struct pipe_context *ctx,
+                                                             struct pipe_resource *buffer,
+                                                             unsigned buffer_offset,
+                                                             unsigned buffer_size)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_streamout_target *t;
-       struct si_resource *buf = si_resource(buffer);
-
-       t = CALLOC_STRUCT(si_streamout_target);
-       if (!t) {
-               return NULL;
-       }
-
-       unsigned buf_filled_size_size = sctx->screen->use_ngg_streamout ? 8 : 4;
-       u_suballocator_alloc(sctx->allocator_zeroed_memory, buf_filled_size_size, 4,
-                            &t->buf_filled_size_offset,
-                            (struct pipe_resource**)&t->buf_filled_size);
-       if (!t->buf_filled_size) {
-               FREE(t);
-               return NULL;
-       }
-
-       t->b.reference.count = 1;
-       t->b.context = ctx;
-       pipe_resource_reference(&t->b.buffer, buffer);
-       t->b.buffer_offset = buffer_offset;
-       t->b.buffer_size = buffer_size;
-
-       util_range_add(&buf->b.b, &buf->valid_buffer_range, buffer_offset,
-                      buffer_offset + buffer_size);
-       return &t->b;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_streamout_target *t;
+   struct si_resource *buf = si_resource(buffer);
+
+   t = CALLOC_STRUCT(si_streamout_target);
+   if (!t) {
+      return NULL;
+   }
+
+   unsigned buf_filled_size_size = sctx->screen->use_ngg_streamout ? 8 : 4;
+   u_suballocator_alloc(sctx->allocator_zeroed_memory, buf_filled_size_size, 4,
+                        &t->buf_filled_size_offset, (struct pipe_resource **)&t->buf_filled_size);
+   if (!t->buf_filled_size) {
+      FREE(t);
+      return NULL;
+   }
+
+   t->b.reference.count = 1;
+   t->b.context = ctx;
+   pipe_resource_reference(&t->b.buffer, buffer);
+   t->b.buffer_offset = buffer_offset;
+   t->b.buffer_size = buffer_size;
+
+   util_range_add(&buf->b.b, &buf->valid_buffer_range, buffer_offset, buffer_offset + buffer_size);
+   return &t->b;
 }
 
-static void si_so_target_destroy(struct pipe_context *ctx,
-                                struct pipe_stream_output_target *target)
+static void si_so_target_destroy(struct pipe_context *ctx, struct pipe_stream_output_target *target)
 {
-       struct si_streamout_target *t = (struct si_streamout_target*)target;
-       pipe_resource_reference(&t->b.buffer, NULL);
-       si_resource_reference(&t->buf_filled_size, NULL);
-       FREE(t);
+   struct si_streamout_target *t = (struct si_streamout_target *)target;
+   pipe_resource_reference(&t->b.buffer, NULL);
+   si_resource_reference(&t->buf_filled_size, NULL);
+   FREE(t);
 }
 
 void si_streamout_buffers_dirty(struct si_context *sctx)
 {
-       if (!sctx->streamout.enabled_mask)
-               return;
+   if (!sctx->streamout.enabled_mask)
+      return;
 
-       si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin);
-       si_set_streamout_enable(sctx, true);
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin);
+   si_set_streamout_enable(sctx, true);
 }
 
-static void si_set_streamout_targets(struct pipe_context *ctx,
-                                    unsigned num_targets,
-                                    struct pipe_stream_output_target **targets,
-                                    const unsigned *offsets)
+static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targets,
+                                     struct pipe_stream_output_target **targets,
+                                     const unsigned *offsets)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       unsigned old_num_targets = sctx->streamout.num_targets;
-       unsigned i;
-       bool wait_now = false;
-
-       /* We are going to unbind the buffers. Mark which caches need to be flushed. */
-       if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) {
-               /* Since streamout uses vector writes which go through TC L2
-                * and most other clients can use TC L2 as well, we don't need
-                * to flush it.
-                *
-                * The only cases which requires flushing it is VGT DMA index
-                * fetching (on <= GFX7) and indirect draw data, which are rare
-                * cases. Thus, flag the TC L2 dirtiness in the resource and
-                * handle it at draw call time.
-                */
-               for (i = 0; i < sctx->streamout.num_targets; i++)
-                       if (sctx->streamout.targets[i])
-                               si_resource(sctx->streamout.targets[i]->b.buffer)->TC_L2_dirty = true;
-
-               /* Invalidate the scalar cache in case a streamout buffer is
-                * going to be used as a constant buffer.
-                *
-                * Invalidate vL1, because streamout bypasses it (done by
-                * setting GLC=1 in the store instruction), but vL1 in other
-                * CUs can contain outdated data of streamout buffers.
-                *
-                * VS_PARTIAL_FLUSH is required if the buffers are going to be
-                * used as an input immediately.
-                */
-               sctx->flags |= SI_CONTEXT_INV_SCACHE |
-                              SI_CONTEXT_INV_VCACHE;
-
-               /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
-               if (sctx->screen->use_ngg_streamout) {
-                       sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
-
-                       /* Wait now. This is needed to make sure that GDS is not
-                        * busy at the end of IBs.
-                        *
-                        * Also, the next streamout operation will overwrite GDS,
-                        * so we need to make sure that it's idle.
-                        */
-                       wait_now = true;
-               } else {
-                       sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
-               }
-       }
-
-       /* All readers of the streamout targets need to be finished before we can
-        * start writing to the targets.
-        */
-       if (num_targets) {
-               if (sctx->screen->use_ngg_streamout)
-                       si_allocate_gds(sctx);
-
-               sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-                              SI_CONTEXT_CS_PARTIAL_FLUSH;
-       }
-
-       /* Streamout buffers must be bound in 2 places:
-        * 1) in VGT by setting the VGT_STRMOUT registers
-        * 2) as shader resources
-        */
-
-       /* Stop streamout. */
-       if (sctx->streamout.num_targets && sctx->streamout.begin_emitted)
-               si_emit_streamout_end(sctx);
-
-       /* Set the new targets. */
-       unsigned enabled_mask = 0, append_bitmask = 0;
-       for (i = 0; i < num_targets; i++) {
-               si_so_target_reference(&sctx->streamout.targets[i], targets[i]);
-               if (!targets[i])
-                       continue;
-
-               si_context_add_resource_size(sctx, targets[i]->buffer);
-               enabled_mask |= 1 << i;
-
-               if (offsets[i] == ((unsigned)-1))
-                       append_bitmask |= 1 << i;
-       }
-
-       for (; i < sctx->streamout.num_targets; i++)
-               si_so_target_reference(&sctx->streamout.targets[i], NULL);
-
-       sctx->streamout.enabled_mask = enabled_mask;
-       sctx->streamout.num_targets = num_targets;
-       sctx->streamout.append_bitmask = append_bitmask;
-
-       /* Update dirty state bits. */
-       if (num_targets) {
-               si_streamout_buffers_dirty(sctx);
-       } else {
-               si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false);
-               si_set_streamout_enable(sctx, false);
-       }
-
-       /* Set the shader resources.*/
-       for (i = 0; i < num_targets; i++) {
-               if (targets[i]) {
-                       struct pipe_shader_buffer sbuf;
-                       sbuf.buffer = targets[i]->buffer;
-
-                       if (sctx->screen->use_ngg_streamout) {
-                               sbuf.buffer_offset = targets[i]->buffer_offset;
-                               sbuf.buffer_size = targets[i]->buffer_size;
-                       } else {
-                               sbuf.buffer_offset = 0;
-                               sbuf.buffer_size = targets[i]->buffer_offset +
-                                                  targets[i]->buffer_size;
-                       }
-
-                       si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf);
-                       si_resource(targets[i]->buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT;
-               } else {
-                       si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
-               }
-       }
-       for (; i < old_num_targets; i++)
-               si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
-
-       if (wait_now)
-               sctx->emit_cache_flush(sctx);
+   struct si_context *sctx = (struct si_context *)ctx;
+   unsigned old_num_targets = sctx->streamout.num_targets;
+   unsigned i;
+   bool wait_now = false;
+
+   /* We are going to unbind the buffers. Mark which caches need to be flushed. */
+   if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) {
+      /* Since streamout uses vector writes which go through TC L2
+       * and most other clients can use TC L2 as well, we don't need
+       * to flush it.
+       *
+       * The only cases which require flushing it are VGT DMA index
+       * fetching (on <= GFX7) and indirect draw data, which are rare
+       * cases. Thus, flag the TC L2 dirtiness in the resource and
+       * handle it at draw call time.
+       */
+      for (i = 0; i < sctx->streamout.num_targets; i++)
+         if (sctx->streamout.targets[i])
+            si_resource(sctx->streamout.targets[i]->b.buffer)->TC_L2_dirty = true;
+
+      /* Invalidate the scalar cache in case a streamout buffer is
+       * going to be used as a constant buffer.
+       *
+       * Invalidate vL1, because streamout bypasses it (done by
+       * setting GLC=1 in the store instruction), but vL1 in other
+       * CUs can contain outdated data of streamout buffers.
+       *
+       * VS_PARTIAL_FLUSH is required if the buffers are going to be
+       * used as an input immediately.
+       */
+      sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
+
+      /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
+      if (sctx->screen->use_ngg_streamout) {
+         sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+
+         /* Wait now. This is needed to make sure that GDS is not
+          * busy at the end of IBs.
+          *
+          * Also, the next streamout operation will overwrite GDS,
+          * so we need to make sure that it's idle.
+          */
+         wait_now = true;
+      } else {
+         sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
+      }
+   }
+
+   /* All readers of the streamout targets need to be finished before we can
+    * start writing to the targets.
+    */
+   if (num_targets) {
+      if (sctx->screen->use_ngg_streamout)
+         si_allocate_gds(sctx);
+
+      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
+   }
+
+   /* Streamout buffers must be bound in 2 places:
+    * 1) in VGT by setting the VGT_STRMOUT registers
+    * 2) as shader resources
+    */
+
+   /* Stop streamout. */
+   if (sctx->streamout.num_targets && sctx->streamout.begin_emitted)
+      si_emit_streamout_end(sctx);
+
+   /* Set the new targets. */
+   unsigned enabled_mask = 0, append_bitmask = 0;
+   for (i = 0; i < num_targets; i++) {
+      si_so_target_reference(&sctx->streamout.targets[i], targets[i]);
+      if (!targets[i])
+         continue;
+
+      si_context_add_resource_size(sctx, targets[i]->buffer);
+      enabled_mask |= 1 << i;
+
+      if (offsets[i] == ((unsigned)-1))
+         append_bitmask |= 1 << i;
+   }
+
+   for (; i < sctx->streamout.num_targets; i++)
+      si_so_target_reference(&sctx->streamout.targets[i], NULL);
+
+   sctx->streamout.enabled_mask = enabled_mask;
+   sctx->streamout.num_targets = num_targets;
+   sctx->streamout.append_bitmask = append_bitmask;
+
+   /* Update dirty state bits. */
+   if (num_targets) {
+      si_streamout_buffers_dirty(sctx);
+   } else {
+      si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false);
+      si_set_streamout_enable(sctx, false);
+   }
+
+   /* Set the shader resources. */
+   for (i = 0; i < num_targets; i++) {
+      if (targets[i]) {
+         struct pipe_shader_buffer sbuf;
+         sbuf.buffer = targets[i]->buffer;
+
+         if (sctx->screen->use_ngg_streamout) {
+            sbuf.buffer_offset = targets[i]->buffer_offset;
+            sbuf.buffer_size = targets[i]->buffer_size;
+         } else {
+            sbuf.buffer_offset = 0;
+            sbuf.buffer_size = targets[i]->buffer_offset + targets[i]->buffer_size;
+         }
+
+         si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf);
+         si_resource(targets[i]->buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT;
+      } else {
+         si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
+      }
+   }
+   for (; i < old_num_targets; i++)
+      si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
+
+   if (wait_now)
+      sctx->emit_cache_flush(sctx);
 }
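In si_set_streamout_targets, bit i of enabled_mask records that target i is bound, while bit i of append_bitmask records that its offset was passed as -1, meaning writes should continue from the previously saved BUFFER_FILLED_SIZE rather than start at a given offset. A minimal sketch of that mask computation, with an opaque pointer standing in for pipe_stream_output_target:

/* Minimal sketch of the mask bookkeeping above; 'targets' is just an array
 * of opaque pointers here, standing in for pipe_stream_output_target. */
void compute_streamout_masks(unsigned num_targets, void *const *targets,
                             const unsigned *offsets,
                             unsigned *enabled_mask, unsigned *append_bitmask)
{
   *enabled_mask = 0;
   *append_bitmask = 0;

   for (unsigned i = 0; i < num_targets; i++) {
      if (!targets[i])
         continue;
      *enabled_mask |= 1u << i;
      if (offsets[i] == (unsigned)-1) /* -1 means "append after the last filled size" */
         *append_bitmask |= 1u << i;
   }
}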
 
 static void gfx10_emit_streamout_begin(struct si_context *sctx)
 {
-       struct si_streamout_target **t = sctx->streamout.targets;
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       unsigned last_target = 0;
-
-       for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
-               if (t[i])
-                       last_target = i;
-       }
-
-       for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
-               if (!t[i])
-                       continue;
-
-               t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];
-
-               bool append = sctx->streamout.append_bitmask & (1 << i);
-               uint64_t va = 0;
-
-               if (append) {
-                       radeon_add_to_buffer_list(sctx,  sctx->gfx_cs,
-                                                 t[i]->buf_filled_size,
-                                                 RADEON_USAGE_READ,
-                                                 RADEON_PRIO_SO_FILLED_SIZE);
-
-                       va = t[i]->buf_filled_size->gpu_address +
-                            t[i]->buf_filled_size_offset;
-               }
-
-               radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
-               radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
-                               S_411_DST_SEL(V_411_GDS) |
-                               S_411_CP_SYNC(i == last_target));
-               radeon_emit(cs, va);
-               radeon_emit(cs, va >> 32);
-               radeon_emit(cs, 4 * i); /* destination in GDS */
-               radeon_emit(cs, 0);
-               radeon_emit(cs, S_414_BYTE_COUNT_GFX9(4) |
-                               S_414_DISABLE_WR_CONFIRM_GFX9(i != last_target));
-       }
-
-       sctx->streamout.begin_emitted = true;
+   struct si_streamout_target **t = sctx->streamout.targets;
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   unsigned last_target = 0;
+
+   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
+      if (t[i])
+         last_target = i;
+   }
+
+   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
+      if (!t[i])
+         continue;
+
+      t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];
+
+      bool append = sctx->streamout.append_bitmask & (1 << i);
+      uint64_t va = 0;
+
+      if (append) {
+         radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_READ,
+                                   RADEON_PRIO_SO_FILLED_SIZE);
+
+         va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
+      }
+
+      radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
+      radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
+                         S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target));
+      radeon_emit(cs, va);
+      radeon_emit(cs, va >> 32);
+      radeon_emit(cs, 4 * i); /* destination in GDS */
+      radeon_emit(cs, 0);
+      radeon_emit(cs, S_414_BYTE_COUNT_GFX9(4) | S_414_DISABLE_WR_CONFIRM_GFX9(i != last_target));
+   }
+
+   sctx->streamout.begin_emitted = true;
 }
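gfx10_emit_streamout_begin primes one GDS counter per target: when appending, the previously saved BUFFER_FILLED_SIZE is DMA'd from memory into GDS, otherwise the counter starts at zero; each counter occupies 4 bytes of GDS, hence the 4 * i destination offset. A hedged sketch of that per-target decision, with a stub standing in for the PKT3_DMA_DATA packet:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stub standing in for the PKT3_DMA_DATA emission above; illustrative only. */
static void emit_dma_to_gds(uint64_t src, bool src_is_memory, unsigned gds_byte_offset)
{
   printf("DMA %s 0x%llx -> GDS+%u\n", src_is_memory ? "VA" : "constant",
          (unsigned long long)src, gds_byte_offset);
}

void prime_gds_counter(unsigned i, bool append, uint64_t filled_size_va)
{
   if (append)
      emit_dma_to_gds(filled_size_va, true, 4 * i); /* continue from the saved size */
   else
      emit_dma_to_gds(0, false, 4 * i);             /* start counting from zero */
}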
 
 static void gfx10_emit_streamout_end(struct si_context *sctx)
 {
-       struct si_streamout_target **t = sctx->streamout.targets;
+   struct si_streamout_target **t = sctx->streamout.targets;
 
-       for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
-               if (!t[i])
-                       continue;
+   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
+      if (!t[i])
+         continue;
 
-               uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
+      uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
 
-               si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_PS_DONE, 0,
-                                 EOP_DST_SEL_TC_L2,
-                                 EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
-                                 EOP_DATA_SEL_GDS,
-                                 t[i]->buf_filled_size, va,
-                                 EOP_DATA_GDS(i, 1), 0);
+      si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2,
+                        EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_GDS,
+                        t[i]->buf_filled_size, va, EOP_DATA_GDS(i, 1), 0);
 
-               t[i]->buf_filled_size_valid = true;
-       }
+      t[i]->buf_filled_size_valid = true;
+   }
 
-       sctx->streamout.begin_emitted = false;
+   sctx->streamout.begin_emitted = false;
 }
 
 static void si_flush_vgt_streamout(struct si_context *sctx)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       unsigned reg_strmout_cntl;
-
-       /* The register is at different places on different ASICs. */
-       if (sctx->chip_class >= GFX7) {
-               reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
-               radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
-       } else {
-               reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
-               radeon_set_config_reg(cs, reg_strmout_cntl, 0);
-       }
-
-       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-       radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
-
-       radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
-       radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
-       radeon_emit(cs, reg_strmout_cntl >> 2);  /* register */
-       radeon_emit(cs, 0);
-       radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
-       radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
-       radeon_emit(cs, 4); /* poll interval */
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   unsigned reg_strmout_cntl;
+
+   /* The register is at different places on different ASICs. */
+   if (sctx->chip_class >= GFX7) {
+      reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
+      radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
+   } else {
+      reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
+      radeon_set_config_reg(cs, reg_strmout_cntl, 0);
+   }
+
+   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+   radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
+
+   radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
+   radeon_emit(cs,
+               WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
+   radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
+   radeon_emit(cs, 0);
+   radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
+   radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
+   radeon_emit(cs, 4);                              /* poll interval */
 }
 
 static void si_emit_streamout_begin(struct si_context *sctx)
 {
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       struct si_streamout_target **t = sctx->streamout.targets;
-       uint16_t *stride_in_dw = sctx->streamout.stride_in_dw;
-       unsigned i;
-
-       si_flush_vgt_streamout(sctx);
-
-       for (i = 0; i < sctx->streamout.num_targets; i++) {
-               if (!t[i])
-                       continue;
-
-               t[i]->stride_in_dw = stride_in_dw[i];
-
-               /* AMD GCN binds streamout buffers as shader resources.
-                * VGT only counts primitives and tells the shader
-                * through SGPRs what to do. */
-               radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2);
-               radeon_emit(cs, (t[i]->b.buffer_offset +
-                                t[i]->b.buffer_size) >> 2);    /* BUFFER_SIZE (in DW) */
-               radeon_emit(cs, stride_in_dw[i]);               /* VTX_STRIDE (in DW) */
-
-               if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
-                       uint64_t va = t[i]->buf_filled_size->gpu_address +
-                                     t[i]->buf_filled_size_offset;
-
-                       /* Append. */
-                       radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
-                       radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
-                                   STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
-                       radeon_emit(cs, 0); /* unused */
-                       radeon_emit(cs, 0); /* unused */
-                       radeon_emit(cs, va); /* src address lo */
-                       radeon_emit(cs, va >> 32); /* src address hi */
-
-                       radeon_add_to_buffer_list(sctx,  sctx->gfx_cs,
-                                                 t[i]->buf_filled_size,
-                                                 RADEON_USAGE_READ,
-                                                 RADEON_PRIO_SO_FILLED_SIZE);
-               } else {
-                       /* Start from the beginning. */
-                       radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
-                       radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
-                                   STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
-                       radeon_emit(cs, 0); /* unused */
-                       radeon_emit(cs, 0); /* unused */
-                       radeon_emit(cs, t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
-                       radeon_emit(cs, 0); /* unused */
-               }
-       }
-
-       sctx->streamout.begin_emitted = true;
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   struct si_streamout_target **t = sctx->streamout.targets;
+   uint16_t *stride_in_dw = sctx->streamout.stride_in_dw;
+   unsigned i;
+
+   si_flush_vgt_streamout(sctx);
+
+   for (i = 0; i < sctx->streamout.num_targets; i++) {
+      if (!t[i])
+         continue;
+
+      t[i]->stride_in_dw = stride_in_dw[i];
+
+      /* AMD GCN binds streamout buffers as shader resources.
+       * VGT only counts primitives and tells the shader
+       * through SGPRs what to do. */
+      radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2);
+      radeon_emit(cs, (t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */
+      radeon_emit(cs, stride_in_dw[i]);                                    /* VTX_STRIDE (in DW) */
+
+      if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
+         uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
+
+         /* Append. */
+         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
+         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
+                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
+         radeon_emit(cs, 0);                                                 /* unused */
+         radeon_emit(cs, 0);                                                 /* unused */
+         radeon_emit(cs, va);                                                /* src address lo */
+         radeon_emit(cs, va >> 32);                                          /* src address hi */
+
+         radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_READ,
+                                   RADEON_PRIO_SO_FILLED_SIZE);
+      } else {
+         /* Start from the beginning. */
+         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
+         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
+                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
+         radeon_emit(cs, 0);                                                    /* unused */
+         radeon_emit(cs, 0);                                                    /* unused */
+         radeon_emit(cs, t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
+         radeon_emit(cs, 0);                          /* unused */
+      }
+   }
+
+   sctx->streamout.begin_emitted = true;
 }
 
 void si_emit_streamout_end(struct si_context *sctx)
 {
-       if (sctx->screen->use_ngg_streamout) {
-               gfx10_emit_streamout_end(sctx);
-               return;
-       }
-
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       struct si_streamout_target **t = sctx->streamout.targets;
-       unsigned i;
-       uint64_t va;
-
-       si_flush_vgt_streamout(sctx);
-
-       for (i = 0; i < sctx->streamout.num_targets; i++) {
-               if (!t[i])
-                       continue;
-
-               va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
-               radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
-               radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
-                           STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
-                           STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
-               radeon_emit(cs, va);     /* dst address lo */
-               radeon_emit(cs, va >> 32); /* dst address hi */
-               radeon_emit(cs, 0); /* unused */
-               radeon_emit(cs, 0); /* unused */
-
-               radeon_add_to_buffer_list(sctx,  sctx->gfx_cs,
-                                         t[i]->buf_filled_size,
-                                         RADEON_USAGE_WRITE,
-                                         RADEON_PRIO_SO_FILLED_SIZE);
-
-               /* Zero the buffer size. The counters (primitives generated,
-                * primitives emitted) may be enabled even if there is not
-                * buffer bound. This ensures that the primitives-emitted query
-                * won't increment. */
-               radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);
-               sctx->context_roll = true;
-
-               t[i]->buf_filled_size_valid = true;
-       }
-
-       sctx->streamout.begin_emitted = false;
+   if (sctx->screen->use_ngg_streamout) {
+      gfx10_emit_streamout_end(sctx);
+      return;
+   }
+
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   struct si_streamout_target **t = sctx->streamout.targets;
+   unsigned i;
+   uint64_t va;
+
+   si_flush_vgt_streamout(sctx);
+
+   for (i = 0; i < sctx->streamout.num_targets; i++) {
+      if (!t[i])
+         continue;
+
+      va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
+      radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
+      radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
+                         STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
+      radeon_emit(cs, va);                                  /* dst address lo */
+      radeon_emit(cs, va >> 32);                            /* dst address hi */
+      radeon_emit(cs, 0);                                   /* unused */
+      radeon_emit(cs, 0);                                   /* unused */
+
+      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_WRITE,
+                                RADEON_PRIO_SO_FILLED_SIZE);
+
+      /* Zero the buffer size. The counters (primitives generated,
+       * primitives emitted) may be enabled even if there is no
+       * buffer bound. This ensures that the primitives-emitted query
+       * won't increment. */
+      radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);
+      sctx->context_roll = true;
+
+      t[i]->buf_filled_size_valid = true;
+   }
+
+   sctx->streamout.begin_emitted = false;
 }
 
 /* STREAMOUT CONFIG DERIVED STATE
@@ -423,71 +400,65 @@ void si_emit_streamout_end(struct si_context *sctx)
 
 static void si_emit_streamout_enable(struct si_context *sctx)
 {
-       assert(!sctx->screen->use_ngg_streamout);
-
-       radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
-       radeon_emit(sctx->gfx_cs,
-                   S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
-                   S_028B94_RAST_STREAM(0) |
-                   S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) |
-                   S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) |
-                   S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx)));
-       radeon_emit(sctx->gfx_cs,
-                   sctx->streamout.hw_enabled_mask &
-                   sctx->streamout.enabled_stream_buffers_mask);
+   assert(!sctx->screen->use_ngg_streamout);
+
+   radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
+   radeon_emit(sctx->gfx_cs, S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
+                                S_028B94_RAST_STREAM(0) |
+                                S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) |
+                                S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) |
+                                S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx)));
+   radeon_emit(sctx->gfx_cs,
+               sctx->streamout.hw_enabled_mask & sctx->streamout.enabled_stream_buffers_mask);
 }
 
 static void si_set_streamout_enable(struct si_context *sctx, bool enable)
 {
-       bool old_strmout_en = si_get_strmout_en(sctx);
-       unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask;
+   bool old_strmout_en = si_get_strmout_en(sctx);
+   unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask;
 
-       sctx->streamout.streamout_enabled = enable;
+   sctx->streamout.streamout_enabled = enable;
 
-       sctx->streamout.hw_enabled_mask = sctx->streamout.enabled_mask |
-                                         (sctx->streamout.enabled_mask << 4) |
-                                         (sctx->streamout.enabled_mask << 8) |
-                                         (sctx->streamout.enabled_mask << 12);
+   sctx->streamout.hw_enabled_mask =
+      sctx->streamout.enabled_mask | (sctx->streamout.enabled_mask << 4) |
+      (sctx->streamout.enabled_mask << 8) | (sctx->streamout.enabled_mask << 12);
 
-       if (!sctx->screen->use_ngg_streamout &&
-           ((old_strmout_en != si_get_strmout_en(sctx)) ||
-            (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask)))
-               si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
+   if (!sctx->screen->use_ngg_streamout &&
+       ((old_strmout_en != si_get_strmout_en(sctx)) ||
+        (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask)))
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
 }
 
-void si_update_prims_generated_query_state(struct si_context *sctx,
-                                          unsigned type, int diff)
+void si_update_prims_generated_query_state(struct si_context *sctx, unsigned type, int diff)
 {
-       if (!sctx->screen->use_ngg_streamout &&
-           type == PIPE_QUERY_PRIMITIVES_GENERATED) {
-               bool old_strmout_en = si_get_strmout_en(sctx);
+   if (!sctx->screen->use_ngg_streamout && type == PIPE_QUERY_PRIMITIVES_GENERATED) {
+      bool old_strmout_en = si_get_strmout_en(sctx);
 
-               sctx->streamout.num_prims_gen_queries += diff;
-               assert(sctx->streamout.num_prims_gen_queries >= 0);
+      sctx->streamout.num_prims_gen_queries += diff;
+      assert(sctx->streamout.num_prims_gen_queries >= 0);
 
-               sctx->streamout.prims_gen_query_enabled =
-                       sctx->streamout.num_prims_gen_queries != 0;
+      sctx->streamout.prims_gen_query_enabled = sctx->streamout.num_prims_gen_queries != 0;
 
-               if (old_strmout_en != si_get_strmout_en(sctx))
-                       si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
+      if (old_strmout_en != si_get_strmout_en(sctx))
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
 
-               if (si_update_ngg(sctx)) {
-                       si_shader_change_notify(sctx);
-                       sctx->do_update_shaders = true;
-               }
-       }
+      if (si_update_ngg(sctx)) {
+         si_shader_change_notify(sctx);
+         sctx->do_update_shaders = true;
+      }
+   }
 }
 
 void si_init_streamout_functions(struct si_context *sctx)
 {
-       sctx->b.create_stream_output_target = si_create_so_target;
-       sctx->b.stream_output_target_destroy = si_so_target_destroy;
-       sctx->b.set_stream_output_targets = si_set_streamout_targets;
-
-       if (sctx->screen->use_ngg_streamout) {
-               sctx->atoms.s.streamout_begin.emit = gfx10_emit_streamout_begin;
-       } else {
-               sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
-               sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
-       }
+   sctx->b.create_stream_output_target = si_create_so_target;
+   sctx->b.stream_output_target_destroy = si_so_target_destroy;
+   sctx->b.set_stream_output_targets = si_set_streamout_targets;
+
+   if (sctx->screen->use_ngg_streamout) {
+      sctx->atoms.s.streamout_begin.emit = gfx10_emit_streamout_begin;
+   } else {
+      sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
+      sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
+   }
 }
index 682f00d44a87d8a92770a93f44f9a358c469cd8a..5149ee1c643fe70443bae9375590dd9dbc5c74b3 100644 (file)
 
 void si_update_ngg_small_prim_precision(struct si_context *ctx)
 {
-       if (!ctx->screen->use_ngg_culling)
-               return;
-
-       /* Set VS_STATE.SMALL_PRIM_PRECISION for NGG culling. */
-       unsigned num_samples = ctx->framebuffer.nr_samples;
-       unsigned quant_mode = ctx->viewports.as_scissor[0].quant_mode;
-       float precision;
-
-       if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
-               precision = num_samples / 4096.0;
-       else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
-               precision = num_samples / 1024.0;
-       else
-               precision = num_samples / 256.0;
-
-       ctx->current_vs_state &= C_VS_STATE_SMALL_PRIM_PRECISION;
-       ctx->current_vs_state |= S_VS_STATE_SMALL_PRIM_PRECISION(fui(precision) >> 23);
+   if (!ctx->screen->use_ngg_culling)
+      return;
+
+   /* Set VS_STATE.SMALL_PRIM_PRECISION for NGG culling. */
+   unsigned num_samples = ctx->framebuffer.nr_samples;
+   unsigned quant_mode = ctx->viewports.as_scissor[0].quant_mode;
+   float precision;
+
+   if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
+      precision = num_samples / 4096.0;
+   else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
+      precision = num_samples / 1024.0;
+   else
+      precision = num_samples / 256.0;
+
+   ctx->current_vs_state &= C_VS_STATE_SMALL_PRIM_PRECISION;
+   ctx->current_vs_state |= S_VS_STATE_SMALL_PRIM_PRECISION(fui(precision) >> 23);
 }
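
As an aside, a tiny standalone sketch of how the precision value above is packed: shifting the
float's bit pattern right by 23 keeps only the sign and exponent bits. Mesa's fui() is a
float-to-uint bit cast and is re-implemented here so the example is self-contained; the sample
count and quantization mode are made up for illustration:

   #include <stdint.h>
   #include <stdio.h>
   #include <string.h>

   /* Float-to-uint bit cast, standing in for Mesa's fui() helper. */
   static uint32_t fui(float f)
   {
      uint32_t u;
      memcpy(&u, &f, sizeof(u));
      return u;
   }

   int main(void)
   {
      /* 4x MSAA with 12.12 quantization: precision = 4 / 4096 = 2^-10.
       * Dropping the 23 mantissa bits keeps only sign and exponent,
       * the compact form stored in VS_STATE.SMALL_PRIM_PRECISION. */
      float precision = 4 / 4096.0;
      printf("0x%x\n", (unsigned)(fui(precision) >> 23));
      return 0;
   }
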
 
-void si_get_small_prim_cull_info(struct si_context *sctx,
-                                struct si_small_prim_cull_info *out)
+void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out)
 {
-       /* This is needed by the small primitive culling, because it's done
-        * in screen space.
-        */
-       struct si_small_prim_cull_info info;
-       unsigned num_samples = sctx->framebuffer.nr_samples;
-       assert(num_samples >= 1);
-
-       info.scale[0] = sctx->viewports.states[0].scale[0];
-       info.scale[1] = sctx->viewports.states[0].scale[1];
-       info.translate[0] = sctx->viewports.states[0].translate[0];
-       info.translate[1] = sctx->viewports.states[0].translate[1];
-
-       /* The viewport shouldn't flip the X axis for the small prim culling to work. */
-       assert(-info.scale[0] + info.translate[0] <= info.scale[0] + info.translate[0]);
-
-       /* If the Y axis is inverted (OpenGL default framebuffer), reverse it.
-        * This is because the viewport transformation inverts the clip space
-        * bounding box, so min becomes max, which breaks small primitive
-        * culling.
-        */
-       if (sctx->viewports.y_inverted) {
-               info.scale[1] = -info.scale[1];
-               info.translate[1] = -info.translate[1];
-       }
-
-       /* Scale the framebuffer up, so that samples become pixels and small
-        * primitive culling is the same for all sample counts.
-        * This only works with the standard DX sample positions, because
-        * the samples are evenly spaced on both X and Y axes.
-        */
-       for (unsigned i = 0; i < 2; i++) {
-               info.scale[i] *= num_samples;
-               info.translate[i] *= num_samples;
-       }
-       *out = info;
+   /* This is needed by the small primitive culling, because it's done
+    * in screen space.
+    */
+   struct si_small_prim_cull_info info;
+   unsigned num_samples = sctx->framebuffer.nr_samples;
+   assert(num_samples >= 1);
+
+   info.scale[0] = sctx->viewports.states[0].scale[0];
+   info.scale[1] = sctx->viewports.states[0].scale[1];
+   info.translate[0] = sctx->viewports.states[0].translate[0];
+   info.translate[1] = sctx->viewports.states[0].translate[1];
+
+   /* The viewport shouldn't flip the X axis for the small prim culling to work. */
+   assert(-info.scale[0] + info.translate[0] <= info.scale[0] + info.translate[0]);
+
+   /* If the Y axis is inverted (OpenGL default framebuffer), reverse it.
+    * This is because the viewport transformation inverts the clip space
+    * bounding box, so min becomes max, which breaks small primitive
+    * culling.
+    */
+   if (sctx->viewports.y_inverted) {
+      info.scale[1] = -info.scale[1];
+      info.translate[1] = -info.translate[1];
+   }
+
+   /* Scale the framebuffer up, so that samples become pixels and small
+    * primitive culling is the same for all sample counts.
+    * This only works with the standard DX sample positions, because
+    * the samples are evenly spaced on both X and Y axes.
+    */
+   for (unsigned i = 0; i < 2; i++) {
+      info.scale[i] *= num_samples;
+      info.translate[i] *= num_samples;
+   }
+   *out = info;
 }
 
-static void si_set_scissor_states(struct pipe_context *pctx,
-                                 unsigned start_slot,
-                                 unsigned num_scissors,
-                                 const struct pipe_scissor_state *state)
+static void si_set_scissor_states(struct pipe_context *pctx, unsigned start_slot,
+                                  unsigned num_scissors, const struct pipe_scissor_state *state)
 {
-       struct si_context *ctx = (struct si_context *)pctx;
-       int i;
+   struct si_context *ctx = (struct si_context *)pctx;
+   int i;
 
-       for (i = 0; i < num_scissors; i++)
-               ctx->scissors[start_slot + i] = state[i];
+   for (i = 0; i < num_scissors; i++)
+      ctx->scissors[start_slot + i] = state[i];
 
-       if (!ctx->queued.named.rasterizer->scissor_enable)
-               return;
+   if (!ctx->queued.named.rasterizer->scissor_enable)
+      return;
 
-       si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
+   si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
 }
 
 /* Since the guard band disables clipping, we have to clip per-pixel
  * using a scissor.
  */
 static void si_get_scissor_from_viewport(struct si_context *ctx,
-                                        const struct pipe_viewport_state *vp,
-                                        struct si_signed_scissor *scissor)
+                                         const struct pipe_viewport_state *vp,
+                                         struct si_signed_scissor *scissor)
 {
-       float tmp, minx, miny, maxx, maxy;
-
-       /* Convert (-1, -1) and (1, 1) from clip space into window space. */
-       minx = -vp->scale[0] + vp->translate[0];
-       miny = -vp->scale[1] + vp->translate[1];
-       maxx = vp->scale[0] + vp->translate[0];
-       maxy = vp->scale[1] + vp->translate[1];
-
-       /* Handle inverted viewports. */
-       if (minx > maxx) {
-               tmp = minx;
-               minx = maxx;
-               maxx = tmp;
-       }
-       if (miny > maxy) {
-               tmp = miny;
-               miny = maxy;
-               maxy = tmp;
-       }
-
-       /* Convert to integer and round up the max bounds. */
-       scissor->minx = minx;
-       scissor->miny = miny;
-       scissor->maxx = ceilf(maxx);
-       scissor->maxy = ceilf(maxy);
+   float tmp, minx, miny, maxx, maxy;
+
+   /* Convert (-1, -1) and (1, 1) from clip space into window space. */
+   minx = -vp->scale[0] + vp->translate[0];
+   miny = -vp->scale[1] + vp->translate[1];
+   maxx = vp->scale[0] + vp->translate[0];
+   maxy = vp->scale[1] + vp->translate[1];
+
+   /* Handle inverted viewports. */
+   if (minx > maxx) {
+      tmp = minx;
+      minx = maxx;
+      maxx = tmp;
+   }
+   if (miny > maxy) {
+      tmp = miny;
+      miny = maxy;
+      maxy = tmp;
+   }
+
+   /* Convert to integer and round up the max bounds. */
+   scissor->minx = minx;
+   scissor->miny = miny;
+   scissor->maxx = ceilf(maxx);
+   scissor->maxy = ceilf(maxy);
 }
 
-static void si_clamp_scissor(struct si_context *ctx,
-                            struct pipe_scissor_state *out,
-                            struct si_signed_scissor *scissor)
+static void si_clamp_scissor(struct si_context *ctx, struct pipe_scissor_state *out,
+                             struct si_signed_scissor *scissor)
 {
-       out->minx = CLAMP(scissor->minx, 0, SI_MAX_SCISSOR);
-       out->miny = CLAMP(scissor->miny, 0, SI_MAX_SCISSOR);
-       out->maxx = CLAMP(scissor->maxx, 0, SI_MAX_SCISSOR);
-       out->maxy = CLAMP(scissor->maxy, 0, SI_MAX_SCISSOR);
+   out->minx = CLAMP(scissor->minx, 0, SI_MAX_SCISSOR);
+   out->miny = CLAMP(scissor->miny, 0, SI_MAX_SCISSOR);
+   out->maxx = CLAMP(scissor->maxx, 0, SI_MAX_SCISSOR);
+   out->maxy = CLAMP(scissor->maxy, 0, SI_MAX_SCISSOR);
 }
 
-static void si_clip_scissor(struct pipe_scissor_state *out,
-                           struct pipe_scissor_state *clip)
+static void si_clip_scissor(struct pipe_scissor_state *out, struct pipe_scissor_state *clip)
 {
-       out->minx = MAX2(out->minx, clip->minx);
-       out->miny = MAX2(out->miny, clip->miny);
-       out->maxx = MIN2(out->maxx, clip->maxx);
-       out->maxy = MIN2(out->maxy, clip->maxy);
+   out->minx = MAX2(out->minx, clip->minx);
+   out->miny = MAX2(out->miny, clip->miny);
+   out->maxx = MIN2(out->maxx, clip->maxx);
+   out->maxy = MIN2(out->maxy, clip->maxy);
 }
 
-static void si_scissor_make_union(struct si_signed_scissor *out,
-                                 struct si_signed_scissor *in)
+static void si_scissor_make_union(struct si_signed_scissor *out, struct si_signed_scissor *in)
 {
-       out->minx = MIN2(out->minx, in->minx);
-       out->miny = MIN2(out->miny, in->miny);
-       out->maxx = MAX2(out->maxx, in->maxx);
-       out->maxy = MAX2(out->maxy, in->maxy);
-       out->quant_mode = MIN2(out->quant_mode, in->quant_mode);
+   out->minx = MIN2(out->minx, in->minx);
+   out->miny = MIN2(out->miny, in->miny);
+   out->maxx = MAX2(out->maxx, in->maxx);
+   out->maxy = MAX2(out->maxy, in->maxy);
+   out->quant_mode = MIN2(out->quant_mode, in->quant_mode);
 }
 
-static void si_emit_one_scissor(struct si_context *ctx,
-                               struct radeon_cmdbuf *cs,
-                               struct si_signed_scissor *vp_scissor,
-                               struct pipe_scissor_state *scissor)
+static void si_emit_one_scissor(struct si_context *ctx, struct radeon_cmdbuf *cs,
+                                struct si_signed_scissor *vp_scissor,
+                                struct pipe_scissor_state *scissor)
 {
-       struct pipe_scissor_state final;
-
-       if (ctx->vs_disables_clipping_viewport) {
-               final.minx = final.miny = 0;
-               final.maxx = final.maxy = SI_MAX_SCISSOR;
-       } else {
-               si_clamp_scissor(ctx, &final, vp_scissor);
-       }
-
-       if (scissor)
-               si_clip_scissor(&final, scissor);
-
-       /* Workaround for a hw bug on GFX6 that occurs when PA_SU_HARDWARE_-
-        * SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0.
-        */
-       if (ctx->chip_class == GFX6 && (final.maxx == 0 || final.maxy == 0)) {
-               radeon_emit(cs, S_028250_TL_X(1) |
-                               S_028250_TL_Y(1) |
-                               S_028250_WINDOW_OFFSET_DISABLE(1));
-               radeon_emit(cs, S_028254_BR_X(1) |
-                               S_028254_BR_Y(1));
-               return;
-       }
-
-       radeon_emit(cs, S_028250_TL_X(final.minx) |
-                       S_028250_TL_Y(final.miny) |
-                       S_028250_WINDOW_OFFSET_DISABLE(1));
-       radeon_emit(cs, S_028254_BR_X(final.maxx) |
-                       S_028254_BR_Y(final.maxy));
+   struct pipe_scissor_state final;
+
+   if (ctx->vs_disables_clipping_viewport) {
+      final.minx = final.miny = 0;
+      final.maxx = final.maxy = SI_MAX_SCISSOR;
+   } else {
+      si_clamp_scissor(ctx, &final, vp_scissor);
+   }
+
+   if (scissor)
+      si_clip_scissor(&final, scissor);
+
+   /* Workaround for a hw bug on GFX6 that occurs when PA_SU_HARDWARE_-
+    * SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0.
+    */
+   if (ctx->chip_class == GFX6 && (final.maxx == 0 || final.maxy == 0)) {
+      radeon_emit(cs, S_028250_TL_X(1) | S_028250_TL_Y(1) | S_028250_WINDOW_OFFSET_DISABLE(1));
+      radeon_emit(cs, S_028254_BR_X(1) | S_028254_BR_Y(1));
+      return;
+   }
+
+   radeon_emit(cs, S_028250_TL_X(final.minx) | S_028250_TL_Y(final.miny) |
+                      S_028250_WINDOW_OFFSET_DISABLE(1));
+   radeon_emit(cs, S_028254_BR_X(final.maxx) | S_028254_BR_Y(final.maxy));
 }
 
 #define MAX_PA_SU_HARDWARE_SCREEN_OFFSET 8176
 
 static void si_emit_guardband(struct si_context *ctx)
 {
-       const struct si_state_rasterizer *rs = ctx->queued.named.rasterizer;
-       struct si_signed_scissor vp_as_scissor;
-       struct pipe_viewport_state vp;
-       float left, top, right, bottom, max_range, guardband_x, guardband_y;
-       float discard_x, discard_y;
-
-       if (ctx->vs_writes_viewport_index) {
-               /* Shaders can draw to any viewport. Make a union of all
-                * viewports. */
-               vp_as_scissor = ctx->viewports.as_scissor[0];
-               for (unsigned i = 1; i < SI_MAX_VIEWPORTS; i++) {
-                       si_scissor_make_union(&vp_as_scissor,
-                                             &ctx->viewports.as_scissor[i]);
-               }
-       } else {
-               vp_as_scissor = ctx->viewports.as_scissor[0];
-       }
-
-       /* Blits don't set the viewport state. The vertex shader determines
-        * the viewport size by scaling the coordinates, so we don't know
-        * how large the viewport is. Assume the worst case.
-        */
-       if (ctx->vs_disables_clipping_viewport)
-               vp_as_scissor.quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
-
-       /* Determine the optimal hardware screen offset to center the viewport
-        * within the viewport range in order to maximize the guardband size.
-        */
-       int hw_screen_offset_x = (vp_as_scissor.maxx + vp_as_scissor.minx) / 2;
-       int hw_screen_offset_y = (vp_as_scissor.maxy + vp_as_scissor.miny) / 2;
-
-       /* GFX6-GFX7 need to align the offset to an ubertile consisting of all SEs. */
-       const unsigned hw_screen_offset_alignment =
-               ctx->chip_class >= GFX8 ? 16 : MAX2(ctx->screen->se_tile_repeat, 16);
-
-       /* Indexed by quantization modes */
-       static int max_viewport_size[] = {65535, 16383, 4095};
-
-       /* Ensure that the whole viewport stays representable in
-        * absolute coordinates.
-        * See comment in si_set_viewport_states.
-        */
-       assert(vp_as_scissor.maxx <= max_viewport_size[vp_as_scissor.quant_mode] &&
-              vp_as_scissor.maxy <= max_viewport_size[vp_as_scissor.quant_mode]);
-
-       hw_screen_offset_x = CLAMP(hw_screen_offset_x, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET);
-       hw_screen_offset_y = CLAMP(hw_screen_offset_y, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET);
-
-       /* Align the screen offset by dropping the low bits. */
-       hw_screen_offset_x &= ~(hw_screen_offset_alignment - 1);
-       hw_screen_offset_y &= ~(hw_screen_offset_alignment - 1);
-
-       /* Apply the offset to center the viewport and maximize the guardband. */
-       vp_as_scissor.minx -= hw_screen_offset_x;
-       vp_as_scissor.maxx -= hw_screen_offset_x;
-       vp_as_scissor.miny -= hw_screen_offset_y;
-       vp_as_scissor.maxy -= hw_screen_offset_y;
-
-       /* Reconstruct the viewport transformation from the scissor. */
-       vp.translate[0] = (vp_as_scissor.minx + vp_as_scissor.maxx) / 2.0;
-       vp.translate[1] = (vp_as_scissor.miny + vp_as_scissor.maxy) / 2.0;
-       vp.scale[0] = vp_as_scissor.maxx - vp.translate[0];
-       vp.scale[1] = vp_as_scissor.maxy - vp.translate[1];
-
-       /* Treat a 0x0 viewport as 1x1 to prevent division by zero. */
-       if (vp_as_scissor.minx == vp_as_scissor.maxx)
-               vp.scale[0] = 0.5;
-       if (vp_as_scissor.miny == vp_as_scissor.maxy)
-               vp.scale[1] = 0.5;
-
-       /* Find the biggest guard band that is inside the supported viewport
-        * range. The guard band is specified as a horizontal and vertical
-        * distance from (0,0) in clip space.
-        *
-        * This is done by applying the inverse viewport transformation
-        * on the viewport limits to get those limits in clip space.
-        *
-        * The viewport range is [-max_viewport_size/2, max_viewport_size/2].
-        */
-       assert(vp_as_scissor.quant_mode < ARRAY_SIZE(max_viewport_size));
-       max_range = max_viewport_size[vp_as_scissor.quant_mode] / 2;
-       left   = (-max_range - vp.translate[0]) / vp.scale[0];
-       right  = ( max_range - vp.translate[0]) / vp.scale[0];
-       top    = (-max_range - vp.translate[1]) / vp.scale[1];
-       bottom = ( max_range - vp.translate[1]) / vp.scale[1];
-
-       assert(left <= -1 && top <= -1 && right >= 1 && bottom >= 1);
-
-       guardband_x = MIN2(-left, right);
-       guardband_y = MIN2(-top, bottom);
-
-       discard_x = 1.0;
-       discard_y = 1.0;
-
-       if (unlikely(util_prim_is_points_or_lines(ctx->current_rast_prim))) {
-               /* When rendering wide points or lines, we need to be more
-                * conservative about when to discard them entirely. */
-               float pixels;
-
-               if (ctx->current_rast_prim == PIPE_PRIM_POINTS)
-                       pixels = rs->max_point_size;
-               else
-                       pixels = rs->line_width;
-
-               /* Add half the point size / line width */
-               discard_x += pixels / (2.0 * vp.scale[0]);
-               discard_y += pixels / (2.0 * vp.scale[1]);
-
-               /* Discard primitives that would lie entirely outside the clip
-                * region. */
-               discard_x = MIN2(discard_x, guardband_x);
-               discard_y = MIN2(discard_y, guardband_y);
-       }
-
-       /* If any of the GB registers is updated, all of them must be updated.
-        * R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, R_028BEC_PA_CL_GB_VERT_DISC_ADJ
-        * R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ
-        */
-       unsigned initial_cdw = ctx->gfx_cs->current.cdw;
-       radeon_opt_set_context_reg4(ctx, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ,
-                                   SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ,
-                                   fui(guardband_y), fui(discard_y),
-                                   fui(guardband_x), fui(discard_x));
-       radeon_opt_set_context_reg(ctx, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET,
-                                  SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET,
-                                  S_028234_HW_SCREEN_OFFSET_X(hw_screen_offset_x >> 4) |
-                                  S_028234_HW_SCREEN_OFFSET_Y(hw_screen_offset_y >> 4));
-       radeon_opt_set_context_reg(ctx, R_028BE4_PA_SU_VTX_CNTL,
-                                  SI_TRACKED_PA_SU_VTX_CNTL,
-                                  S_028BE4_PIX_CENTER(rs->half_pixel_center) |
-                                  S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH +
-                                                      vp_as_scissor.quant_mode));
-       if (initial_cdw != ctx->gfx_cs->current.cdw)
-               ctx->context_roll = true;
-
-       si_update_ngg_small_prim_precision(ctx);
+   const struct si_state_rasterizer *rs = ctx->queued.named.rasterizer;
+   struct si_signed_scissor vp_as_scissor;
+   struct pipe_viewport_state vp;
+   float left, top, right, bottom, max_range, guardband_x, guardband_y;
+   float discard_x, discard_y;
+
+   if (ctx->vs_writes_viewport_index) {
+      /* Shaders can draw to any viewport. Make a union of all
+       * viewports. */
+      vp_as_scissor = ctx->viewports.as_scissor[0];
+      for (unsigned i = 1; i < SI_MAX_VIEWPORTS; i++) {
+         si_scissor_make_union(&vp_as_scissor, &ctx->viewports.as_scissor[i]);
+      }
+   } else {
+      vp_as_scissor = ctx->viewports.as_scissor[0];
+   }
+
+   /* Blits don't set the viewport state. The vertex shader determines
+    * the viewport size by scaling the coordinates, so we don't know
+    * how large the viewport is. Assume the worst case.
+    */
+   if (ctx->vs_disables_clipping_viewport)
+      vp_as_scissor.quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
+
+   /* Determine the optimal hardware screen offset to center the viewport
+    * within the viewport range in order to maximize the guardband size.
+    */
+   int hw_screen_offset_x = (vp_as_scissor.maxx + vp_as_scissor.minx) / 2;
+   int hw_screen_offset_y = (vp_as_scissor.maxy + vp_as_scissor.miny) / 2;
+
+   /* GFX6-GFX7 need to align the offset to an ubertile consisting of all SEs. */
+   const unsigned hw_screen_offset_alignment =
+      ctx->chip_class >= GFX8 ? 16 : MAX2(ctx->screen->se_tile_repeat, 16);
+
+   /* Indexed by quantization modes */
+   static int max_viewport_size[] = {65535, 16383, 4095};
+
+   /* Ensure that the whole viewport stays representable in
+    * absolute coordinates.
+    * See comment in si_set_viewport_states.
+    */
+   assert(vp_as_scissor.maxx <= max_viewport_size[vp_as_scissor.quant_mode] &&
+          vp_as_scissor.maxy <= max_viewport_size[vp_as_scissor.quant_mode]);
+
+   hw_screen_offset_x = CLAMP(hw_screen_offset_x, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET);
+   hw_screen_offset_y = CLAMP(hw_screen_offset_y, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET);
+
+   /* Align the screen offset by dropping the low bits. */
+   hw_screen_offset_x &= ~(hw_screen_offset_alignment - 1);
+   hw_screen_offset_y &= ~(hw_screen_offset_alignment - 1);
+
+   /* Apply the offset to center the viewport and maximize the guardband. */
+   vp_as_scissor.minx -= hw_screen_offset_x;
+   vp_as_scissor.maxx -= hw_screen_offset_x;
+   vp_as_scissor.miny -= hw_screen_offset_y;
+   vp_as_scissor.maxy -= hw_screen_offset_y;
+
+   /* Reconstruct the viewport transformation from the scissor. */
+   vp.translate[0] = (vp_as_scissor.minx + vp_as_scissor.maxx) / 2.0;
+   vp.translate[1] = (vp_as_scissor.miny + vp_as_scissor.maxy) / 2.0;
+   vp.scale[0] = vp_as_scissor.maxx - vp.translate[0];
+   vp.scale[1] = vp_as_scissor.maxy - vp.translate[1];
+
+   /* Treat a 0x0 viewport as 1x1 to prevent division by zero. */
+   if (vp_as_scissor.minx == vp_as_scissor.maxx)
+      vp.scale[0] = 0.5;
+   if (vp_as_scissor.miny == vp_as_scissor.maxy)
+      vp.scale[1] = 0.5;
+
+   /* Find the biggest guard band that is inside the supported viewport
+    * range. The guard band is specified as a horizontal and vertical
+    * distance from (0,0) in clip space.
+    *
+    * This is done by applying the inverse viewport transformation
+    * on the viewport limits to get those limits in clip space.
+    *
+    * The viewport range is [-max_viewport_size/2, max_viewport_size/2].
+    */
+   assert(vp_as_scissor.quant_mode < ARRAY_SIZE(max_viewport_size));
+   max_range = max_viewport_size[vp_as_scissor.quant_mode] / 2;
+   left = (-max_range - vp.translate[0]) / vp.scale[0];
+   right = (max_range - vp.translate[0]) / vp.scale[0];
+   top = (-max_range - vp.translate[1]) / vp.scale[1];
+   bottom = (max_range - vp.translate[1]) / vp.scale[1];
+
+   assert(left <= -1 && top <= -1 && right >= 1 && bottom >= 1);
+
+   guardband_x = MIN2(-left, right);
+   guardband_y = MIN2(-top, bottom);
+
+   discard_x = 1.0;
+   discard_y = 1.0;
+
+   if (unlikely(util_prim_is_points_or_lines(ctx->current_rast_prim))) {
+      /* When rendering wide points or lines, we need to be more
+       * conservative about when to discard them entirely. */
+      float pixels;
+
+      if (ctx->current_rast_prim == PIPE_PRIM_POINTS)
+         pixels = rs->max_point_size;
+      else
+         pixels = rs->line_width;
+
+      /* Add half the point size / line width */
+      discard_x += pixels / (2.0 * vp.scale[0]);
+      discard_y += pixels / (2.0 * vp.scale[1]);
+
+      /* Discard primitives that would lie entirely outside the clip
+       * region. */
+      discard_x = MIN2(discard_x, guardband_x);
+      discard_y = MIN2(discard_y, guardband_y);
+   }
+
+   /* If any of the GB registers is updated, all of them must be updated.
+    * R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, R_028BEC_PA_CL_GB_VERT_DISC_ADJ
+    * R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ
+    */
+   unsigned initial_cdw = ctx->gfx_cs->current.cdw;
+   radeon_opt_set_context_reg4(ctx, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ,
+                               SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, fui(guardband_y), fui(discard_y),
+                               fui(guardband_x), fui(discard_x));
+   radeon_opt_set_context_reg(ctx, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET,
+                              SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET,
+                              S_028234_HW_SCREEN_OFFSET_X(hw_screen_offset_x >> 4) |
+                                 S_028234_HW_SCREEN_OFFSET_Y(hw_screen_offset_y >> 4));
+   radeon_opt_set_context_reg(
+      ctx, R_028BE4_PA_SU_VTX_CNTL, SI_TRACKED_PA_SU_VTX_CNTL,
+      S_028BE4_PIX_CENTER(rs->half_pixel_center) |
+         S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH + vp_as_scissor.quant_mode));
+   if (initial_cdw != ctx->gfx_cs->current.cdw)
+      ctx->context_roll = true;
+
+   si_update_ngg_small_prim_precision(ctx);
 }
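
As an aside, a small standalone worked example of the guard band math above: the inverse
viewport transform maps the representable coordinate range back into clip space, and the guard
band is the smaller of the two resulting distances from the origin. The viewport numbers below
are made up, and MIN2 is re-defined so the snippet compiles on its own:

   #include <stdio.h>

   #define MIN2(a, b) ((a) < (b) ? (a) : (b))

   int main(void)
   {
      /* Made-up example: a 1920-wide viewport at the origin with 16_8
       * quantization, so the representable range is +/- 65535 / 2. */
      float scale_x = 960.0f, translate_x = 960.0f;
      float max_range = 65535 / 2.0f;

      /* Inverse viewport transform applied to the hardware limits gives
       * the clip-space bounds; the guard band is the smaller distance. */
      float left = (-max_range - translate_x) / scale_x;
      float right = (max_range - translate_x) / scale_x;
      float guardband_x = MIN2(-left, right);

      printf("left=%.2f right=%.2f guardband_x=%.2f\n", left, right, guardband_x);
      return 0;
   }
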
 
 static void si_emit_scissors(struct si_context *ctx)
 {
-       struct radeon_cmdbuf *cs = ctx->gfx_cs;
-       struct pipe_scissor_state *states = ctx->scissors;
-       bool scissor_enabled = ctx->queued.named.rasterizer->scissor_enable;
-
-       /* The simple case: Only 1 viewport is active. */
-       if (!ctx->vs_writes_viewport_index) {
-               struct si_signed_scissor *vp = &ctx->viewports.as_scissor[0];
-
-               radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, 2);
-               si_emit_one_scissor(ctx, cs, vp, scissor_enabled ? &states[0] : NULL);
-               return;
-       }
-
-       /* All registers in the array need to be updated if any of them is changed.
-        * This is a hardware requirement.
-        */
-       radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL,
-                                  SI_MAX_VIEWPORTS * 2);
-       for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) {
-               si_emit_one_scissor(ctx, cs, &ctx->viewports.as_scissor[i],
-                                   scissor_enabled ? &states[i] : NULL);
-       }
+   struct radeon_cmdbuf *cs = ctx->gfx_cs;
+   struct pipe_scissor_state *states = ctx->scissors;
+   bool scissor_enabled = ctx->queued.named.rasterizer->scissor_enable;
+
+   /* The simple case: Only 1 viewport is active. */
+   if (!ctx->vs_writes_viewport_index) {
+      struct si_signed_scissor *vp = &ctx->viewports.as_scissor[0];
+
+      radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, 2);
+      si_emit_one_scissor(ctx, cs, vp, scissor_enabled ? &states[0] : NULL);
+      return;
+   }
+
+   /* All registers in the array need to be updated if any of them is changed.
+    * This is a hardware requirement.
+    */
+   radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, SI_MAX_VIEWPORTS * 2);
+   for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) {
+      si_emit_one_scissor(ctx, cs, &ctx->viewports.as_scissor[i],
+                          scissor_enabled ? &states[i] : NULL);
+   }
 }
 
-static void si_set_viewport_states(struct pipe_context *pctx,
-                                  unsigned start_slot,
-                                  unsigned num_viewports,
-                                  const struct pipe_viewport_state *state)
+static void si_set_viewport_states(struct pipe_context *pctx, unsigned start_slot,
+                                   unsigned num_viewports, const struct pipe_viewport_state *state)
 {
-       struct si_context *ctx = (struct si_context *)pctx;
-       int i;
-
-       for (i = 0; i < num_viewports; i++) {
-               unsigned index = start_slot + i;
-               struct si_signed_scissor *scissor = &ctx->viewports.as_scissor[index];
-
-               ctx->viewports.states[index] = state[i];
-
-               si_get_scissor_from_viewport(ctx, &state[i], scissor);
-
-               unsigned w = scissor->maxx - scissor->minx;
-               unsigned h = scissor->maxy - scissor->miny;
-               unsigned max_extent = MAX2(w, h);
-
-               int max_corner = MAX2(scissor->maxx, scissor->maxy);
-
-               unsigned center_x = (scissor->maxx + scissor->minx) / 2;
-               unsigned center_y = (scissor->maxy + scissor->miny) / 2;
-               unsigned max_center = MAX2(center_x, center_y);
-
-               /* PA_SU_HARDWARE_SCREEN_OFFSET can't center viewports whose
-                * center start farther than MAX_PA_SU_HARDWARE_SCREEN_OFFSET.
-                * (for example, a 1x1 viewport in the lower right corner of
-                * 16Kx16K) Such viewports need a greater guardband, so they
-                * have to use a worse quantization mode.
-                */
-               unsigned distance_off_center =
-                       MAX2(0, (int)max_center - MAX_PA_SU_HARDWARE_SCREEN_OFFSET);
-               max_extent += distance_off_center;
-
-               /* Determine the best quantization mode (subpixel precision),
-                * but also leave enough space for the guardband.
-                *
-                * Note that primitive binning requires QUANT_MODE == 16_8 on Vega10
-                * and Raven1 for line and rectangle primitive types to work correctly.
-                * Always use 16_8 if primitive binning is possible to occur.
-                */
-               if ((ctx->family == CHIP_VEGA10 || ctx->family == CHIP_RAVEN) &&
-                   ctx->screen->dpbb_allowed)
-                       max_extent = 16384; /* Use QUANT_MODE == 16_8. */
-
-               /* Another constraint is that all coordinates in the viewport
-                * are representable in fixed point with respect to the
-                * surface origin.
-                *
-                * It means that PA_SU_HARDWARE_SCREEN_OFFSET can't be given
-                * an offset that would make the upper corner of the viewport
-                * greater than the maximum representable number post
-                * quantization, ie 2^quant_bits.
-                *
-                * This does not matter for 14.10 and 16.8 formats since the
-                * offset is already limited at 8k, but it means we can't use
-                * 12.12 if we are drawing to some pixels outside the lower
-                * 4k x 4k of the render target.
-                */
-
-               if (max_extent <= 1024 && max_corner < 4096) /* 4K scanline area for guardband */
-                       scissor->quant_mode = SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH;
-               else if (max_extent <= 4096) /* 16K scanline area for guardband */
-                       scissor->quant_mode = SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH;
-               else /* 64K scanline area for guardband */
-                       scissor->quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
-       }
-
-       if (start_slot == 0) {
-               ctx->viewports.y_inverted =
-                       -state->scale[1] + state->translate[1] >
-                       state->scale[1] + state->translate[1];
-       }
-
-       si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
-       si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
-       si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
+   struct si_context *ctx = (struct si_context *)pctx;
+   int i;
+
+   for (i = 0; i < num_viewports; i++) {
+      unsigned index = start_slot + i;
+      struct si_signed_scissor *scissor = &ctx->viewports.as_scissor[index];
+
+      ctx->viewports.states[index] = state[i];
+
+      si_get_scissor_from_viewport(ctx, &state[i], scissor);
+
+      unsigned w = scissor->maxx - scissor->minx;
+      unsigned h = scissor->maxy - scissor->miny;
+      unsigned max_extent = MAX2(w, h);
+
+      int max_corner = MAX2(scissor->maxx, scissor->maxy);
+
+      unsigned center_x = (scissor->maxx + scissor->minx) / 2;
+      unsigned center_y = (scissor->maxy + scissor->miny) / 2;
+      unsigned max_center = MAX2(center_x, center_y);
+
+      /* PA_SU_HARDWARE_SCREEN_OFFSET can't center viewports whose
+       * center starts farther than MAX_PA_SU_HARDWARE_SCREEN_OFFSET.
+       * (for example, a 1x1 viewport in the lower right corner of
+       * 16Kx16K) Such viewports need a greater guardband, so they
+       * have to use a worse quantization mode.
+       */
+      unsigned distance_off_center = MAX2(0, (int)max_center - MAX_PA_SU_HARDWARE_SCREEN_OFFSET);
+      max_extent += distance_off_center;
+
+      /* Determine the best quantization mode (subpixel precision),
+       * but also leave enough space for the guardband.
+       *
+       * Note that primitive binning requires QUANT_MODE == 16_8 on Vega10
+       * and Raven1 for line and rectangle primitive types to work correctly.
+       * Always use 16_8 if primitive binning may occur.
+       */
+      if ((ctx->family == CHIP_VEGA10 || ctx->family == CHIP_RAVEN) && ctx->screen->dpbb_allowed)
+         max_extent = 16384; /* Use QUANT_MODE == 16_8. */
+
+      /* Another constraint is that all coordinates in the viewport
+       * are representable in fixed point with respect to the
+       * surface origin.
+       *
+       * It means that PA_SU_HARDWARE_SCREEN_OFFSET can't be given
+       * an offset that would make the upper corner of the viewport
+       * greater than the maximum representable number post
+       * quantization, ie 2^quant_bits.
+       *
+       * This does not matter for 14.10 and 16.8 formats since the
+       * offset is already limited at 8k, but it means we can't use
+       * 12.12 if we are drawing to some pixels outside the lower
+       * 4k x 4k of the render target.
+       */
+
+      if (max_extent <= 1024 && max_corner < 4096) /* 4K scanline area for guardband */
+         scissor->quant_mode = SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH;
+      else if (max_extent <= 4096) /* 16K scanline area for guardband */
+         scissor->quant_mode = SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH;
+      else /* 64K scanline area for guardband */
+         scissor->quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
+   }
+
+   if (start_slot == 0) {
+      ctx->viewports.y_inverted =
+         -state->scale[1] + state->translate[1] > state->scale[1] + state->translate[1];
+   }
+
+   si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
+   si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
+   si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
 }
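
As an aside, a minimal standalone sketch of the quantization-mode selection at the end of
si_set_viewport_states() above; the enum names below are shortened stand-ins for the
SI_QUANT_MODE_* values:

   #include <stdio.h>

   enum quant_mode { QUANT_12_12, QUANT_14_10, QUANT_16_8 };

   /* Pick the finest subpixel precision that still leaves room for the
    * guard band, using the same thresholds as the code above. */
   static enum quant_mode pick_quant_mode(unsigned max_extent, int max_corner)
   {
      if (max_extent <= 1024 && max_corner < 4096)
         return QUANT_12_12; /* 4K scanline area for the guard band */
      if (max_extent <= 4096)
         return QUANT_14_10; /* 16K scanline area */
      return QUANT_16_8;     /* 64K scanline area */
   }

   int main(void)
   {
      printf("%d\n", pick_quant_mode(800, 800));   /* 0 = QUANT_12_12 */
      printf("%d\n", pick_quant_mode(2000, 2000)); /* 1 = QUANT_14_10 */
      return 0;
   }
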
 
-static void si_emit_one_viewport(struct si_context *ctx,
-                                struct pipe_viewport_state *state)
+static void si_emit_one_viewport(struct si_context *ctx, struct pipe_viewport_state *state)
 {
-       struct radeon_cmdbuf *cs = ctx->gfx_cs;
-
-       radeon_emit(cs, fui(state->scale[0]));
-       radeon_emit(cs, fui(state->translate[0]));
-       radeon_emit(cs, fui(state->scale[1]));
-       radeon_emit(cs, fui(state->translate[1]));
-       radeon_emit(cs, fui(state->scale[2]));
-       radeon_emit(cs, fui(state->translate[2]));
+   struct radeon_cmdbuf *cs = ctx->gfx_cs;
+
+   radeon_emit(cs, fui(state->scale[0]));
+   radeon_emit(cs, fui(state->translate[0]));
+   radeon_emit(cs, fui(state->scale[1]));
+   radeon_emit(cs, fui(state->translate[1]));
+   radeon_emit(cs, fui(state->scale[2]));
+   radeon_emit(cs, fui(state->translate[2]));
 }
 
 static void si_emit_viewports(struct si_context *ctx)
 {
-       struct radeon_cmdbuf *cs = ctx->gfx_cs;
-       struct pipe_viewport_state *states = ctx->viewports.states;
-
-       if (ctx->screen->use_ngg_culling) {
-               /* Set the viewport info for small primitive culling. */
-               struct si_small_prim_cull_info info;
-               si_get_small_prim_cull_info(ctx, &info);
-
-               if (memcmp(&info, &ctx->last_small_prim_cull_info, sizeof(info))) {
-                       unsigned offset = 0;
-
-                       /* Align to 256, because the address is shifted by 8 bits. */
-                       u_upload_data(ctx->b.const_uploader, 0, sizeof(info), 256,
-                                     &info, &offset,
-                                     (struct pipe_resource**)&ctx->small_prim_cull_info_buf);
-
-                       ctx->small_prim_cull_info_address =
-                               ctx->small_prim_cull_info_buf->gpu_address + offset;
-                       ctx->last_small_prim_cull_info = info;
-                       ctx->small_prim_cull_info_dirty = true;
-               }
-
-               if (ctx->small_prim_cull_info_dirty) {
-                       /* This will end up in SGPR6 as (value << 8), shifted by the hw. */
-                       radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->small_prim_cull_info_buf,
-                                                 RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
-                       radeon_set_sh_reg(ctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS,
-                                         ctx->small_prim_cull_info_address >> 8);
-                       ctx->small_prim_cull_info_dirty = false;
-               }
-       }
-
-       /* The simple case: Only 1 viewport is active. */
-       if (!ctx->vs_writes_viewport_index) {
-               radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6);
-               si_emit_one_viewport(ctx, &states[0]);
-               return;
-       }
-
-       /* All registers in the array need to be updated if any of them is changed.
-        * This is a hardware requirement.
-        */
-       radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE +
-                                  0, SI_MAX_VIEWPORTS * 6);
-       for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++)
-               si_emit_one_viewport(ctx, &states[i]);
+   struct radeon_cmdbuf *cs = ctx->gfx_cs;
+   struct pipe_viewport_state *states = ctx->viewports.states;
+
+   if (ctx->screen->use_ngg_culling) {
+      /* Set the viewport info for small primitive culling. */
+      struct si_small_prim_cull_info info;
+      si_get_small_prim_cull_info(ctx, &info);
+
+      if (memcmp(&info, &ctx->last_small_prim_cull_info, sizeof(info))) {
+         unsigned offset = 0;
+
+         /* Align to 256, because the address is shifted by 8 bits. */
+         u_upload_data(ctx->b.const_uploader, 0, sizeof(info), 256, &info, &offset,
+                       (struct pipe_resource **)&ctx->small_prim_cull_info_buf);
+
+         ctx->small_prim_cull_info_address = ctx->small_prim_cull_info_buf->gpu_address + offset;
+         ctx->last_small_prim_cull_info = info;
+         ctx->small_prim_cull_info_dirty = true;
+      }
+
+      if (ctx->small_prim_cull_info_dirty) {
+         /* This will end up in SGPR6 as (value << 8), shifted by the hw. */
+         radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->small_prim_cull_info_buf,
+                                   RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
+         radeon_set_sh_reg(ctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS,
+                           ctx->small_prim_cull_info_address >> 8);
+         ctx->small_prim_cull_info_dirty = false;
+      }
+   }
+
+   /* The simple case: Only 1 viewport is active. */
+   if (!ctx->vs_writes_viewport_index) {
+      radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6);
+      si_emit_one_viewport(ctx, &states[0]);
+      return;
+   }
+
+   /* All registers in the array need to be updated if any of them is changed.
+    * This is a hardware requirement.
+    */
+   radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE + 0, SI_MAX_VIEWPORTS * 6);
+   for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++)
+      si_emit_one_viewport(ctx, &states[i]);
 }
 
-static inline void
-si_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
-                     bool window_space_position, float *zmin, float *zmax)
+static inline void si_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
+                                         bool window_space_position, float *zmin, float *zmax)
 {
-       if (window_space_position) {
-               *zmin = 0;
-               *zmax = 1;
-               return;
-       }
-       util_viewport_zmin_zmax(vp, halfz, zmin, zmax);
+   if (window_space_position) {
+      *zmin = 0;
+      *zmax = 1;
+      return;
+   }
+   util_viewport_zmin_zmax(vp, halfz, zmin, zmax);
 }
 
 static void si_emit_depth_ranges(struct si_context *ctx)
 {
-       struct radeon_cmdbuf *cs = ctx->gfx_cs;
-       struct pipe_viewport_state *states = ctx->viewports.states;
-       bool clip_halfz = ctx->queued.named.rasterizer->clip_halfz;
-       bool window_space = ctx->vs_disables_clipping_viewport;
-       float zmin, zmax;
-
-       /* The simple case: Only 1 viewport is active. */
-       if (!ctx->vs_writes_viewport_index) {
-               si_viewport_zmin_zmax(&states[0], clip_halfz, window_space,
-                                     &zmin, &zmax);
-
-               radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, 2);
-               radeon_emit(cs, fui(zmin));
-               radeon_emit(cs, fui(zmax));
-               return;
-       }
-
-       /* All registers in the array need to be updated if any of them is changed.
-        * This is a hardware requirement.
-        */
-       radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0,
-                                  SI_MAX_VIEWPORTS * 2);
-       for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) {
-               si_viewport_zmin_zmax(&states[i], clip_halfz, window_space,
-                                     &zmin, &zmax);
-               radeon_emit(cs, fui(zmin));
-               radeon_emit(cs, fui(zmax));
-       }
+   struct radeon_cmdbuf *cs = ctx->gfx_cs;
+   struct pipe_viewport_state *states = ctx->viewports.states;
+   bool clip_halfz = ctx->queued.named.rasterizer->clip_halfz;
+   bool window_space = ctx->vs_disables_clipping_viewport;
+   float zmin, zmax;
+
+   /* The simple case: Only 1 viewport is active. */
+   if (!ctx->vs_writes_viewport_index) {
+      si_viewport_zmin_zmax(&states[0], clip_halfz, window_space, &zmin, &zmax);
+
+      radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, 2);
+      radeon_emit(cs, fui(zmin));
+      radeon_emit(cs, fui(zmax));
+      return;
+   }
+
+   /* All registers in the array need to be updated if any of them is changed.
+    * This is a hardware requirement.
+    */
+   radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, SI_MAX_VIEWPORTS * 2);
+   for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) {
+      si_viewport_zmin_zmax(&states[i], clip_halfz, window_space, &zmin, &zmax);
+      radeon_emit(cs, fui(zmin));
+      radeon_emit(cs, fui(zmax));
+   }
 }
 
 static void si_emit_viewport_states(struct si_context *ctx)
 {
-       si_emit_viewports(ctx);
-       si_emit_depth_ranges(ctx);
+   si_emit_viewports(ctx);
+   si_emit_depth_ranges(ctx);
 }
 
 /**
@@ -579,128 +550,112 @@ static void si_emit_viewport_states(struct si_context *ctx)
  */
 void si_update_vs_viewport_state(struct si_context *ctx)
 {
-       struct si_shader_info *info = si_get_vs_info(ctx);
-       bool vs_window_space;
-
-       if (!info)
-               return;
-
-       /* When the VS disables clipping and viewport transformation. */
-       vs_window_space =
-               info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
-
-       if (ctx->vs_disables_clipping_viewport != vs_window_space) {
-               ctx->vs_disables_clipping_viewport = vs_window_space;
-               si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
-               si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
-       }
-
-       /* Viewport index handling. */
-       if (ctx->vs_writes_viewport_index == info->writes_viewport_index)
-               return;
-
-       /* This changes how the guardband is computed. */
-       ctx->vs_writes_viewport_index = info->writes_viewport_index;
-       si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
-
-       /* Emit scissors and viewports that were enabled by having
-        * the ViewportIndex output.
-        */
-       if (info->writes_viewport_index) {
-           si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
-           si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
-       }
+   struct si_shader_info *info = si_get_vs_info(ctx);
+   bool vs_window_space;
+
+   if (!info)
+      return;
+
+   /* When the VS disables clipping and viewport transformation. */
+   vs_window_space = info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
+
+   if (ctx->vs_disables_clipping_viewport != vs_window_space) {
+      ctx->vs_disables_clipping_viewport = vs_window_space;
+      si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
+      si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
+   }
+
+   /* Viewport index handling. */
+   if (ctx->vs_writes_viewport_index == info->writes_viewport_index)
+      return;
+
+   /* This changes how the guardband is computed. */
+   ctx->vs_writes_viewport_index = info->writes_viewport_index;
+   si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
+
+   /* Emit scissors and viewports that were enabled by having
+    * the ViewportIndex output.
+    */
+   if (info->writes_viewport_index) {
+      si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
+      si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
+   }
 }
 
 static void si_emit_window_rectangles(struct si_context *sctx)
 {
-       /* There are four clipping rectangles. Their corner coordinates are inclusive.
-        * Every pixel is assigned a number between 0 and 15 by setting bits 0-3 depending
-        * on whether the pixel is inside cliprects 0-3, respectively. For example,
-        * if a pixel is inside cliprects 0 and 1, but outside 2 and 3, it is assigned
-        * the number 3 (binary 0011).
-        *
-        * If CLIPRECT_RULE & (1 << number), the pixel is rasterized.
-        */
-       struct radeon_cmdbuf *cs = sctx->gfx_cs;
-       static const unsigned outside[4] = {
-               /* outside rectangle 0 */
-               V_02820C_OUT |
-               V_02820C_IN_1 |
-               V_02820C_IN_2 |
-               V_02820C_IN_21 |
-               V_02820C_IN_3 |
-               V_02820C_IN_31 |
-               V_02820C_IN_32 |
-               V_02820C_IN_321,
-               /* outside rectangles 0, 1 */
-               V_02820C_OUT |
-               V_02820C_IN_2 |
-               V_02820C_IN_3 |
-               V_02820C_IN_32,
-               /* outside rectangles 0, 1, 2 */
-               V_02820C_OUT |
-               V_02820C_IN_3,
-               /* outside rectangles 0, 1, 2, 3 */
-               V_02820C_OUT,
-       };
-       const unsigned disabled = 0xffff; /* all inside and outside cases */
-       unsigned num_rectangles = sctx->num_window_rectangles;
-       struct pipe_scissor_state *rects = sctx->window_rectangles;
-       unsigned rule;
-
-       assert(num_rectangles <= 4);
-
-       if (num_rectangles == 0)
-               rule = disabled;
-       else if (sctx->window_rectangles_include)
-               rule = ~outside[num_rectangles - 1];
-       else
-               rule = outside[num_rectangles - 1];
-
-       radeon_opt_set_context_reg(sctx, R_02820C_PA_SC_CLIPRECT_RULE,
-                                  SI_TRACKED_PA_SC_CLIPRECT_RULE, rule);
-       if (num_rectangles == 0)
-               return;
-
-       radeon_set_context_reg_seq(cs, R_028210_PA_SC_CLIPRECT_0_TL,
-                                  num_rectangles * 2);
-       for (unsigned i = 0; i < num_rectangles; i++) {
-               radeon_emit(cs, S_028210_TL_X(rects[i].minx) |
-                               S_028210_TL_Y(rects[i].miny));
-               radeon_emit(cs, S_028214_BR_X(rects[i].maxx) |
-                               S_028214_BR_Y(rects[i].maxy));
-       }
+   /* There are four clipping rectangles. Their corner coordinates are inclusive.
+    * Every pixel is assigned a number between 0 and 15 by setting bits 0-3 depending
+    * on whether the pixel is inside cliprects 0-3, respectively. For example,
+    * if a pixel is inside cliprects 0 and 1, but outside 2 and 3, it is assigned
+    * the number 3 (binary 0011).
+    *
+    * If CLIPRECT_RULE & (1 << number), the pixel is rasterized.
+    */
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+   static const unsigned outside[4] = {
+      /* outside rectangle 0 */
+      V_02820C_OUT | V_02820C_IN_1 | V_02820C_IN_2 | V_02820C_IN_21 | V_02820C_IN_3 |
+         V_02820C_IN_31 | V_02820C_IN_32 | V_02820C_IN_321,
+      /* outside rectangles 0, 1 */
+      V_02820C_OUT | V_02820C_IN_2 | V_02820C_IN_3 | V_02820C_IN_32,
+      /* outside rectangles 0, 1, 2 */
+      V_02820C_OUT | V_02820C_IN_3,
+      /* outside rectangles 0, 1, 2, 3 */
+      V_02820C_OUT,
+   };
+   const unsigned disabled = 0xffff; /* all inside and outside cases */
+   unsigned num_rectangles = sctx->num_window_rectangles;
+   struct pipe_scissor_state *rects = sctx->window_rectangles;
+   unsigned rule;
+
+   assert(num_rectangles <= 4);
+
+   if (num_rectangles == 0)
+      rule = disabled;
+   else if (sctx->window_rectangles_include)
+      rule = ~outside[num_rectangles - 1];
+   else
+      rule = outside[num_rectangles - 1];
+
+   radeon_opt_set_context_reg(sctx, R_02820C_PA_SC_CLIPRECT_RULE, SI_TRACKED_PA_SC_CLIPRECT_RULE,
+                              rule);
+   if (num_rectangles == 0)
+      return;
+
+   radeon_set_context_reg_seq(cs, R_028210_PA_SC_CLIPRECT_0_TL, num_rectangles * 2);
+   for (unsigned i = 0; i < num_rectangles; i++) {
+      radeon_emit(cs, S_028210_TL_X(rects[i].minx) | S_028210_TL_Y(rects[i].miny));
+      radeon_emit(cs, S_028214_BR_X(rects[i].maxx) | S_028214_BR_Y(rects[i].maxy));
+   }
 }
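To make the rule computation above concrete: the outside[] table indexes pixels by their 4-bit inside-mask, and include mode simply complements the exclude rule. A worked example follows (illustrative only, not part of this change):

   /* Assuming num_rectangles = 2:
    *   exclude mode: rule = outside[1]
    *                      = V_02820C_OUT | V_02820C_IN_2 | V_02820C_IN_3 | V_02820C_IN_32
    *                 -> only pixels outside both rectangles 0 and 1 are rasterized.
    *   include mode: rule = ~outside[1]
    *                 -> any pixel whose inside-mask has bit 0 or bit 1 set is rasterized,
    *                    i.e. anything inside rectangle 0 or 1, regardless of 2 and 3.
    */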
 
-static void si_set_window_rectangles(struct pipe_context *ctx,
-                                    bool include,
-                                    unsigned num_rectangles,
-                                    const struct pipe_scissor_state *rects)
+static void si_set_window_rectangles(struct pipe_context *ctx, bool include,
+                                     unsigned num_rectangles,
+                                     const struct pipe_scissor_state *rects)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
+   struct si_context *sctx = (struct si_context *)ctx;
 
-       sctx->num_window_rectangles = num_rectangles;
-       sctx->window_rectangles_include = include;
-       if (num_rectangles) {
-               memcpy(sctx->window_rectangles, rects,
-                      sizeof(*rects) * num_rectangles);
-       }
+   sctx->num_window_rectangles = num_rectangles;
+   sctx->window_rectangles_include = include;
+   if (num_rectangles) {
+      memcpy(sctx->window_rectangles, rects, sizeof(*rects) * num_rectangles);
+   }
 
-       si_mark_atom_dirty(sctx, &sctx->atoms.s.window_rectangles);
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.window_rectangles);
 }
 
 void si_init_viewport_functions(struct si_context *ctx)
 {
-       ctx->atoms.s.guardband.emit = si_emit_guardband;
-       ctx->atoms.s.scissors.emit = si_emit_scissors;
-       ctx->atoms.s.viewports.emit = si_emit_viewport_states;
-       ctx->atoms.s.window_rectangles.emit = si_emit_window_rectangles;
+   ctx->atoms.s.guardband.emit = si_emit_guardband;
+   ctx->atoms.s.scissors.emit = si_emit_scissors;
+   ctx->atoms.s.viewports.emit = si_emit_viewport_states;
+   ctx->atoms.s.window_rectangles.emit = si_emit_window_rectangles;
 
-       ctx->b.set_scissor_states = si_set_scissor_states;
-       ctx->b.set_viewport_states = si_set_viewport_states;
-       ctx->b.set_window_rectangles = si_set_window_rectangles;
+   ctx->b.set_scissor_states = si_set_scissor_states;
+   ctx->b.set_viewport_states = si_set_viewport_states;
+   ctx->b.set_window_rectangles = si_set_window_rectangles;
 
-       for (unsigned i = 0; i < 16; i++)
-               ctx->viewports.as_scissor[i].quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
+   for (unsigned i = 0; i < 16; i++)
+      ctx->viewports.as_scissor[i].quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
 }
index f803448cfc605ab6290fc2876adf10045834fae9..7b4ecedbcba4e0bbbf796703b288c9ba6762c23f 100644 (file)
@@ -26,8 +26,8 @@
 /* This file implements randomized SDMA texture blit tests. */
 
 #include "si_pipe.h"
-#include "util/u_surface.h"
 #include "util/rand_xor.h"
+#include "util/u_surface.h"
 
 static uint64_t seed_xorshift128plus[2];
 
@@ -36,382 +36,356 @@ static uint64_t seed_xorshift128plus[2];
 /* The GPU blits are emulated on the CPU using these CPU textures. */
 
 struct cpu_texture {
-       uint8_t *ptr;
-       uint64_t size;
-       uint64_t layer_stride;
-       unsigned stride;
+   uint8_t *ptr;
+   uint64_t size;
+   uint64_t layer_stride;
+   unsigned stride;
 };
 
-static void alloc_cpu_texture(struct cpu_texture *tex,
-                             struct pipe_resource *templ)
+static void alloc_cpu_texture(struct cpu_texture *tex, struct pipe_resource *templ)
 {
-       tex->stride = align(util_format_get_stride(templ->format, templ->width0),
-                           RAND_NUM_SIZE);
-       tex->layer_stride = (uint64_t)tex->stride * templ->height0;
-       tex->size = tex->layer_stride * templ->array_size;
-       tex->ptr = malloc(tex->size);
-       assert(tex->ptr);
+   tex->stride = align(util_format_get_stride(templ->format, templ->width0), RAND_NUM_SIZE);
+   tex->layer_stride = (uint64_t)tex->stride * templ->height0;
+   tex->size = tex->layer_stride * templ->array_size;
+   tex->ptr = malloc(tex->size);
+   assert(tex->ptr);
 }
 
-static void set_random_pixels(struct pipe_context *ctx,
-                             struct pipe_resource *tex,
-                             struct cpu_texture *cpu)
+static void set_random_pixels(struct pipe_context *ctx, struct pipe_resource *tex,
+                              struct cpu_texture *cpu)
 {
-       struct pipe_transfer *t;
-       uint8_t *map;
-       int x,y,z;
-
-       map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_WRITE,
-                                  0, 0, 0, tex->width0, tex->height0,
-                                  tex->array_size, &t);
-       assert(map);
-
-       for (z = 0; z < tex->array_size; z++) {
-               for (y = 0; y < tex->height0; y++) {
-                       uint64_t *ptr = (uint64_t*)
-                               (map + t->layer_stride*z + t->stride*y);
-                       uint64_t *ptr_cpu = (uint64_t*)
-                               (cpu->ptr + cpu->layer_stride*z + cpu->stride*y);
-                       unsigned size = cpu->stride / RAND_NUM_SIZE;
-
-                       assert(t->stride % RAND_NUM_SIZE == 0);
-                       assert(cpu->stride % RAND_NUM_SIZE == 0);
-
-                       for (x = 0; x < size; x++) {
-                               *ptr++ = *ptr_cpu++ =
-                                       rand_xorshift128plus(seed_xorshift128plus);
-                       }
-               }
-       }
-
-       pipe_transfer_unmap(ctx, t);
+   struct pipe_transfer *t;
+   uint8_t *map;
+   int x, y, z;
+
+   map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_WRITE, 0, 0, 0, tex->width0, tex->height0,
+                              tex->array_size, &t);
+   assert(map);
+
+   for (z = 0; z < tex->array_size; z++) {
+      for (y = 0; y < tex->height0; y++) {
+         uint64_t *ptr = (uint64_t *)(map + t->layer_stride * z + t->stride * y);
+         uint64_t *ptr_cpu = (uint64_t *)(cpu->ptr + cpu->layer_stride * z + cpu->stride * y);
+         unsigned size = cpu->stride / RAND_NUM_SIZE;
+
+         assert(t->stride % RAND_NUM_SIZE == 0);
+         assert(cpu->stride % RAND_NUM_SIZE == 0);
+
+         for (x = 0; x < size; x++) {
+            *ptr++ = *ptr_cpu++ = rand_xorshift128plus(seed_xorshift128plus);
+         }
+      }
+   }
+
+   pipe_transfer_unmap(ctx, t);
 }
 
-static bool compare_textures(struct pipe_context *ctx,
-                            struct pipe_resource *tex,
-                            struct cpu_texture *cpu)
+static bool compare_textures(struct pipe_context *ctx, struct pipe_resource *tex,
+                             struct cpu_texture *cpu)
 {
-       struct pipe_transfer *t;
-       uint8_t *map;
-       int y,z;
-       bool pass = true;
-       unsigned stride = util_format_get_stride(tex->format, tex->width0);
-
-       map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_READ,
-                                  0, 0, 0, tex->width0, tex->height0,
-                                  tex->array_size, &t);
-       assert(map);
-
-       for (z = 0; z < tex->array_size; z++) {
-               for (y = 0; y < tex->height0; y++) {
-                       uint8_t *ptr = map + t->layer_stride*z + t->stride*y;
-                       uint8_t *cpu_ptr = cpu->ptr +
-                                          cpu->layer_stride*z + cpu->stride*y;
-
-                       if (memcmp(ptr, cpu_ptr, stride)) {
-                               pass = false;
-                               goto done;
-                       }
-               }
-       }
+   struct pipe_transfer *t;
+   uint8_t *map;
+   int y, z;
+   bool pass = true;
+   unsigned stride = util_format_get_stride(tex->format, tex->width0);
+
+   map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_READ, 0, 0, 0, tex->width0, tex->height0,
+                              tex->array_size, &t);
+   assert(map);
+
+   for (z = 0; z < tex->array_size; z++) {
+      for (y = 0; y < tex->height0; y++) {
+         uint8_t *ptr = map + t->layer_stride * z + t->stride * y;
+         uint8_t *cpu_ptr = cpu->ptr + cpu->layer_stride * z + cpu->stride * y;
+
+         if (memcmp(ptr, cpu_ptr, stride)) {
+            pass = false;
+            goto done;
+         }
+      }
+   }
 done:
-       pipe_transfer_unmap(ctx, t);
-       return pass;
+   pipe_transfer_unmap(ctx, t);
+   return pass;
 }
 
 static enum pipe_format choose_format()
 {
-       enum pipe_format formats[] = {
-               PIPE_FORMAT_R8_UINT,
-               PIPE_FORMAT_R16_UINT,
-               PIPE_FORMAT_R32_UINT,
-               PIPE_FORMAT_R32G32_UINT,
-               PIPE_FORMAT_R32G32B32A32_UINT,
-               PIPE_FORMAT_G8R8_B8R8_UNORM,
-       };
-       return formats[rand() % ARRAY_SIZE(formats)];
+   enum pipe_format formats[] = {
+      PIPE_FORMAT_R8_UINT,     PIPE_FORMAT_R16_UINT,          PIPE_FORMAT_R32_UINT,
+      PIPE_FORMAT_R32G32_UINT, PIPE_FORMAT_R32G32B32A32_UINT, PIPE_FORMAT_G8R8_B8R8_UNORM,
+   };
+   return formats[rand() % ARRAY_SIZE(formats)];
 }
 
-static const char *array_mode_to_string(struct si_screen *sscreen,
-                                       struct radeon_surf *surf)
+static const char *array_mode_to_string(struct si_screen *sscreen, struct radeon_surf *surf)
 {
-       if (sscreen->info.chip_class >= GFX9) {
-               switch (surf->u.gfx9.surf.swizzle_mode) {
-               case 0:
-                       return "  LINEAR";
-               case 21:
-                       return " 4KB_S_X";
-               case 22:
-                       return " 4KB_D_X";
-               case 25:
-                       return "64KB_S_X";
-               case 26:
-                       return "64KB_D_X";
-               default:
-                       printf("Unhandled swizzle mode = %u\n",
-                              surf->u.gfx9.surf.swizzle_mode);
-                       return " UNKNOWN";
-               }
-       } else {
-               switch (surf->u.legacy.level[0].mode) {
-               case RADEON_SURF_MODE_LINEAR_ALIGNED:
-                       return "LINEAR_ALIGNED";
-               case RADEON_SURF_MODE_1D:
-                       return "1D_TILED_THIN1";
-               case RADEON_SURF_MODE_2D:
-                       return "2D_TILED_THIN1";
-               default:
-                       assert(0);
-                       return "       UNKNOWN";
-               }
-       }
+   if (sscreen->info.chip_class >= GFX9) {
+      switch (surf->u.gfx9.surf.swizzle_mode) {
+      case 0:
+         return "  LINEAR";
+      case 21:
+         return " 4KB_S_X";
+      case 22:
+         return " 4KB_D_X";
+      case 25:
+         return "64KB_S_X";
+      case 26:
+         return "64KB_D_X";
+      default:
+         printf("Unhandled swizzle mode = %u\n", surf->u.gfx9.surf.swizzle_mode);
+         return " UNKNOWN";
+      }
+   } else {
+      switch (surf->u.legacy.level[0].mode) {
+      case RADEON_SURF_MODE_LINEAR_ALIGNED:
+         return "LINEAR_ALIGNED";
+      case RADEON_SURF_MODE_1D:
+         return "1D_TILED_THIN1";
+      case RADEON_SURF_MODE_2D:
+         return "2D_TILED_THIN1";
+      default:
+         assert(0);
+         return "       UNKNOWN";
+      }
+   }
 }
 
 static unsigned generate_max_tex_side(unsigned max_tex_side)
 {
-       switch (rand() % 4) {
-       case 0:
-               /* Try to hit large sizes in 1/4 of the cases. */
-               return max_tex_side;
-       case 1:
-               /* Try to hit 1D tiling in 1/4 of the cases. */
-               return 128;
-       default:
-               /* Try to hit common sizes in 2/4 of the cases. */
-               return 2048;
-       }
+   switch (rand() % 4) {
+   case 0:
+      /* Try to hit large sizes in 1/4 of the cases. */
+      return max_tex_side;
+   case 1:
+      /* Try to hit 1D tiling in 1/4 of the cases. */
+      return 128;
+   default:
+      /* Try to hit common sizes in 2/4 of the cases. */
+      return 2048;
+   }
 }
 
 void si_test_dma(struct si_screen *sscreen)
 {
-       struct pipe_screen *screen = &sscreen->b;
-       struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
-       struct si_context *sctx = (struct si_context*)ctx;
-       uint64_t max_alloc_size;
-       unsigned i, iterations, num_partial_copies, max_tex_side;
-       unsigned num_pass = 0, num_fail = 0;
-
-       max_tex_side = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_SIZE);
-
-       /* Max 128 MB allowed for both textures. */
-       max_alloc_size = 128 * 1024 * 1024;
-
-       /* the seed for random test parameters */
-       srand(0x9b47d95b);
-       /* the seed for random pixel data */
-       s_rand_xorshift128plus(seed_xorshift128plus, false);
-
-       iterations = 1000000000; /* just kill it when you are bored */
-       num_partial_copies = 30;
-
-       /* These parameters are randomly generated per test:
-        * - whether to do one whole-surface copy or N partial copies per test
-        * - which tiling modes to use (LINEAR_ALIGNED, 1D, 2D)
-        * - which texture dimensions to use
-        * - whether to use VRAM (all tiling modes) and GTT (staging, linear
-        *   only) allocations
-        * - random initial pixels in src
-        * - generate random subrectangle copies for partial blits
-        */
-       for (i = 0; i < iterations; i++) {
-               struct pipe_resource tsrc = {}, tdst = {}, *src, *dst;
-               struct si_texture *sdst;
-               struct si_texture *ssrc;
-               struct cpu_texture src_cpu, dst_cpu;
-               unsigned max_width, max_height, max_depth, j, num;
-               unsigned gfx_blits = 0, dma_blits = 0, cs_blits = 0, max_tex_side_gen;
-               unsigned max_tex_layers;
-               bool pass;
-               bool do_partial_copies = rand() & 1;
-
-               /* generate a random test case */
-               tsrc.target = tdst.target = PIPE_TEXTURE_2D_ARRAY;
-               tsrc.depth0 = tdst.depth0 = 1;
-
-               tsrc.format = tdst.format = choose_format();
-
-               max_tex_side_gen = generate_max_tex_side(max_tex_side);
-               max_tex_layers = rand() % 4 ? 1 : 5;
-
-               tsrc.width0 = (rand() % max_tex_side_gen) + 1;
-               tsrc.height0 = (rand() % max_tex_side_gen) + 1;
-               tsrc.array_size = (rand() % max_tex_layers) + 1;
-
-               if (tsrc.format == PIPE_FORMAT_G8R8_B8R8_UNORM)
-                       tsrc.width0 = align(tsrc.width0, 2);
-
-               /* Have a 1/4 chance of getting power-of-two dimensions. */
-               if (rand() % 4 == 0) {
-                       tsrc.width0 = util_next_power_of_two(tsrc.width0);
-                       tsrc.height0 = util_next_power_of_two(tsrc.height0);
-               }
-
-               if (!do_partial_copies) {
-                       /* whole-surface copies only, same dimensions */
-                       tdst = tsrc;
-               } else {
-                       max_tex_side_gen = generate_max_tex_side(max_tex_side);
-                       max_tex_layers = rand() % 4 ? 1 : 5;
-
-                       /* many partial copies, dimensions can be different */
-                       tdst.width0 = (rand() % max_tex_side_gen) + 1;
-                       tdst.height0 = (rand() % max_tex_side_gen) + 1;
-                       tdst.array_size = (rand() % max_tex_layers) + 1;
-
-                       /* Have a 1/4 chance of getting power-of-two dimensions. */
-                       if (rand() % 4 == 0) {
-                               tdst.width0 = util_next_power_of_two(tdst.width0);
-                               tdst.height0 = util_next_power_of_two(tdst.height0);
-                       }
-               }
-
-               /* check texture sizes */
-               if ((uint64_t) util_format_get_nblocks(tsrc.format, tsrc.width0, tsrc.height0)
-                       * tsrc.array_size * util_format_get_blocksize(tsrc.format) +
-                   (uint64_t) util_format_get_nblocks(tdst.format, tdst.width0, tdst.height0)
-                       * tdst.array_size * util_format_get_blocksize(tdst.format) >
-                   max_alloc_size) {
-                       /* too large, try again */
-                       i--;
-                       continue;
-               }
-
-               /* VRAM + the tiling mode depends on dimensions (3/4 of cases),
-                * or GTT + linear only (1/4 of cases)
-                */
-               tsrc.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING;
-               tdst.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING;
-
-               /* Allocate textures (both the GPU and CPU copies).
-                * The CPU will emulate what the GPU should be doing.
-                */
-               src = screen->resource_create(screen, &tsrc);
-               dst = screen->resource_create(screen, &tdst);
-               assert(src);
-               assert(dst);
-               sdst = (struct si_texture*)dst;
-               ssrc = (struct si_texture*)src;
-               alloc_cpu_texture(&src_cpu, &tsrc);
-               alloc_cpu_texture(&dst_cpu, &tdst);
-
-               printf("%4u: dst = (%5u x %5u x %u, %s), "
-                      " src = (%5u x %5u x %u, %s), format = %s, ",
-                      i, tdst.width0, tdst.height0, tdst.array_size,
-                      array_mode_to_string(sscreen, &sdst->surface),
-                      tsrc.width0, tsrc.height0, tsrc.array_size,
-                      array_mode_to_string(sscreen, &ssrc->surface),
-                      util_format_description(tsrc.format)->name);
-               fflush(stdout);
-
-               /* set src pixels */
-               set_random_pixels(ctx, src, &src_cpu);
-
-               /* clear dst pixels */
-               uint32_t zero = 0;
-               si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, &zero, 4,
-                               SI_COHERENCY_SHADER, false);
-               memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size);
-
-               /* preparation */
-               max_width = MIN2(tsrc.width0, tdst.width0);
-               max_height = MIN2(tsrc.height0, tdst.height0);
-               max_depth = MIN2(tsrc.array_size, tdst.array_size);
-
-               num = do_partial_copies ? num_partial_copies : 1;
-               for (j = 0; j < num; j++) {
-                       int width, height, depth;
-                       int srcx, srcy, srcz, dstx, dsty, dstz;
-                       struct pipe_box box;
-                       unsigned old_num_draw_calls = sctx->num_draw_calls;
-                       unsigned old_num_dma_calls = sctx->num_dma_calls;
-                       unsigned old_num_cs_calls = sctx->num_compute_calls;
-
-                       if (!do_partial_copies) {
-                               /* copy whole src to dst */
-                               width = max_width;
-                               height = max_height;
-                               depth = max_depth;
-
-                               srcx = srcy = srcz = dstx = dsty = dstz = 0;
-                       } else {
-                               /* random sub-rectangle copies from src to dst */
-                               depth = (rand() % max_depth) + 1;
-                               srcz = rand() % (tsrc.array_size - depth + 1);
-                               dstz = rand() % (tdst.array_size - depth + 1);
-
-                               /* special code path to hit the tiled partial copies */
-                               if (!ssrc->surface.is_linear &&
-                                   !sdst->surface.is_linear &&
-                                   rand() & 1) {
-                                       if (max_width < 8 || max_height < 8)
-                                               continue;
-                                       width = ((rand() % (max_width / 8)) + 1) * 8;
-                                       height = ((rand() % (max_height / 8)) + 1) * 8;
-
-                                       srcx = rand() % (tsrc.width0 - width + 1) & ~0x7;
-                                       srcy = rand() % (tsrc.height0 - height + 1) & ~0x7;
-
-                                       dstx = rand() % (tdst.width0 - width + 1) & ~0x7;
-                                       dsty = rand() % (tdst.height0 - height + 1) & ~0x7;
-                               } else {
-                                       /* just make sure that it doesn't divide by zero */
-                                       assert(max_width > 0 && max_height > 0);
-
-                                       width = (rand() % max_width) + 1;
-                                       height = (rand() % max_height) + 1;
-
-                                       srcx = rand() % (tsrc.width0 - width + 1);
-                                       srcy = rand() % (tsrc.height0 - height + 1);
-
-                                       dstx = rand() % (tdst.width0 - width + 1);
-                                       dsty = rand() % (tdst.height0 - height + 1);
-                               }
-
-                               /* special code path to hit out-of-bounds reads in L2T */
-                               if (ssrc->surface.is_linear &&
-                                   !sdst->surface.is_linear &&
-                                   rand() % 4 == 0) {
-                                       srcx = 0;
-                                       srcy = 0;
-                                       srcz = 0;
-                               }
-                       }
-
-                       /* GPU copy */
-                       u_box_3d(srcx, srcy, srcz, width, height, depth, &box);
-                       sctx->dma_copy(ctx, dst, 0, dstx, dsty, dstz, src, 0, &box);
-
-                       /* See which engine was used. */
-                       gfx_blits += sctx->num_draw_calls > old_num_draw_calls;
-                       dma_blits += sctx->num_dma_calls > old_num_dma_calls;
-                       cs_blits  += sctx->num_compute_calls > old_num_cs_calls;
-
-                       /* CPU copy */
-                       util_copy_box(dst_cpu.ptr, tdst.format, dst_cpu.stride,
-                                     dst_cpu.layer_stride,
-                                     dstx, dsty, dstz, width, height, depth,
-                                     src_cpu.ptr, src_cpu.stride,
-                                     src_cpu.layer_stride,
-                                     srcx, srcy, srcz);
-               }
-
-               pass = compare_textures(ctx, dst, &dst_cpu);
-               if (pass)
-                       num_pass++;
-               else
-                       num_fail++;
-
-               printf("BLITs: GFX = %2u, DMA = %2u, CS = %2u, %s [%u/%u]\n",
-                      gfx_blits, dma_blits, cs_blits, pass ? "pass" : "fail",
-                      num_pass, num_pass+num_fail);
-
-               /* cleanup */
-               pipe_resource_reference(&src, NULL);
-               pipe_resource_reference(&dst, NULL);
-               free(src_cpu.ptr);
-               free(dst_cpu.ptr);
-       }
-
-       ctx->destroy(ctx);
-       exit(0);
+   struct pipe_screen *screen = &sscreen->b;
+   struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
+   struct si_context *sctx = (struct si_context *)ctx;
+   uint64_t max_alloc_size;
+   unsigned i, iterations, num_partial_copies, max_tex_side;
+   unsigned num_pass = 0, num_fail = 0;
+
+   max_tex_side = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_SIZE);
+
+   /* Max 128 MB allowed for both textures. */
+   max_alloc_size = 128 * 1024 * 1024;
+
+   /* the seed for random test parameters */
+   srand(0x9b47d95b);
+   /* the seed for random pixel data */
+   s_rand_xorshift128plus(seed_xorshift128plus, false);
+
+   iterations = 1000000000; /* just kill it when you are bored */
+   num_partial_copies = 30;
+
+   /* These parameters are randomly generated per test:
+    * - whether to do one whole-surface copy or N partial copies per test
+    * - which tiling modes to use (LINEAR_ALIGNED, 1D, 2D)
+    * - which texture dimensions to use
+    * - whether to use VRAM (all tiling modes) and GTT (staging, linear
+    *   only) allocations
+    * - random initial pixels in src
+    * - generate random subrectangle copies for partial blits
+    */
+   for (i = 0; i < iterations; i++) {
+      struct pipe_resource tsrc = {}, tdst = {}, *src, *dst;
+      struct si_texture *sdst;
+      struct si_texture *ssrc;
+      struct cpu_texture src_cpu, dst_cpu;
+      unsigned max_width, max_height, max_depth, j, num;
+      unsigned gfx_blits = 0, dma_blits = 0, cs_blits = 0, max_tex_side_gen;
+      unsigned max_tex_layers;
+      bool pass;
+      bool do_partial_copies = rand() & 1;
+
+      /* generate a random test case */
+      tsrc.target = tdst.target = PIPE_TEXTURE_2D_ARRAY;
+      tsrc.depth0 = tdst.depth0 = 1;
+
+      tsrc.format = tdst.format = choose_format();
+
+      max_tex_side_gen = generate_max_tex_side(max_tex_side);
+      max_tex_layers = rand() % 4 ? 1 : 5;
+
+      tsrc.width0 = (rand() % max_tex_side_gen) + 1;
+      tsrc.height0 = (rand() % max_tex_side_gen) + 1;
+      tsrc.array_size = (rand() % max_tex_layers) + 1;
+
+      if (tsrc.format == PIPE_FORMAT_G8R8_B8R8_UNORM)
+         tsrc.width0 = align(tsrc.width0, 2);
+
+      /* Have a 1/4 chance of getting power-of-two dimensions. */
+      if (rand() % 4 == 0) {
+         tsrc.width0 = util_next_power_of_two(tsrc.width0);
+         tsrc.height0 = util_next_power_of_two(tsrc.height0);
+      }
+
+      if (!do_partial_copies) {
+         /* whole-surface copies only, same dimensions */
+         tdst = tsrc;
+      } else {
+         max_tex_side_gen = generate_max_tex_side(max_tex_side);
+         max_tex_layers = rand() % 4 ? 1 : 5;
+
+         /* many partial copies, dimensions can be different */
+         tdst.width0 = (rand() % max_tex_side_gen) + 1;
+         tdst.height0 = (rand() % max_tex_side_gen) + 1;
+         tdst.array_size = (rand() % max_tex_layers) + 1;
+
+         /* Have a 1/4 chance of getting power-of-two dimensions. */
+         if (rand() % 4 == 0) {
+            tdst.width0 = util_next_power_of_two(tdst.width0);
+            tdst.height0 = util_next_power_of_two(tdst.height0);
+         }
+      }
+
+      /* check texture sizes */
+      if ((uint64_t)util_format_get_nblocks(tsrc.format, tsrc.width0, tsrc.height0) *
+                tsrc.array_size * util_format_get_blocksize(tsrc.format) +
+             (uint64_t)util_format_get_nblocks(tdst.format, tdst.width0, tdst.height0) *
+                tdst.array_size * util_format_get_blocksize(tdst.format) >
+          max_alloc_size) {
+         /* too large, try again */
+         i--;
+         continue;
+      }
+
+      /* VRAM + the tiling mode depends on dimensions (3/4 of cases),
+       * or GTT + linear only (1/4 of cases)
+       */
+      tsrc.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING;
+      tdst.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING;
+
+      /* Allocate textures (both the GPU and CPU copies).
+       * The CPU will emulate what the GPU should be doing.
+       */
+      src = screen->resource_create(screen, &tsrc);
+      dst = screen->resource_create(screen, &tdst);
+      assert(src);
+      assert(dst);
+      sdst = (struct si_texture *)dst;
+      ssrc = (struct si_texture *)src;
+      alloc_cpu_texture(&src_cpu, &tsrc);
+      alloc_cpu_texture(&dst_cpu, &tdst);
+
+      printf("%4u: dst = (%5u x %5u x %u, %s), "
+             " src = (%5u x %5u x %u, %s), format = %s, ",
+             i, tdst.width0, tdst.height0, tdst.array_size,
+             array_mode_to_string(sscreen, &sdst->surface), tsrc.width0, tsrc.height0,
+             tsrc.array_size, array_mode_to_string(sscreen, &ssrc->surface),
+             util_format_description(tsrc.format)->name);
+      fflush(stdout);
+
+      /* set src pixels */
+      set_random_pixels(ctx, src, &src_cpu);
+
+      /* clear dst pixels */
+      uint32_t zero = 0;
+      si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, &zero, 4, SI_COHERENCY_SHADER, false);
+      memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size);
+
+      /* preparation */
+      max_width = MIN2(tsrc.width0, tdst.width0);
+      max_height = MIN2(tsrc.height0, tdst.height0);
+      max_depth = MIN2(tsrc.array_size, tdst.array_size);
+
+      num = do_partial_copies ? num_partial_copies : 1;
+      for (j = 0; j < num; j++) {
+         int width, height, depth;
+         int srcx, srcy, srcz, dstx, dsty, dstz;
+         struct pipe_box box;
+         unsigned old_num_draw_calls = sctx->num_draw_calls;
+         unsigned old_num_dma_calls = sctx->num_dma_calls;
+         unsigned old_num_cs_calls = sctx->num_compute_calls;
+
+         if (!do_partial_copies) {
+            /* copy whole src to dst */
+            width = max_width;
+            height = max_height;
+            depth = max_depth;
+
+            srcx = srcy = srcz = dstx = dsty = dstz = 0;
+         } else {
+            /* random sub-rectangle copies from src to dst */
+            depth = (rand() % max_depth) + 1;
+            srcz = rand() % (tsrc.array_size - depth + 1);
+            dstz = rand() % (tdst.array_size - depth + 1);
+
+            /* special code path to hit the tiled partial copies */
+            if (!ssrc->surface.is_linear && !sdst->surface.is_linear && rand() & 1) {
+               if (max_width < 8 || max_height < 8)
+                  continue;
+               width = ((rand() % (max_width / 8)) + 1) * 8;
+               height = ((rand() % (max_height / 8)) + 1) * 8;
+
+               srcx = rand() % (tsrc.width0 - width + 1) & ~0x7;
+               srcy = rand() % (tsrc.height0 - height + 1) & ~0x7;
+
+               dstx = rand() % (tdst.width0 - width + 1) & ~0x7;
+               dsty = rand() % (tdst.height0 - height + 1) & ~0x7;
+            } else {
+               /* just make sure that it doesn't divide by zero */
+               assert(max_width > 0 && max_height > 0);
+
+               width = (rand() % max_width) + 1;
+               height = (rand() % max_height) + 1;
+
+               srcx = rand() % (tsrc.width0 - width + 1);
+               srcy = rand() % (tsrc.height0 - height + 1);
+
+               dstx = rand() % (tdst.width0 - width + 1);
+               dsty = rand() % (tdst.height0 - height + 1);
+            }
+
+            /* special code path to hit out-of-bounds reads in L2T */
+            if (ssrc->surface.is_linear && !sdst->surface.is_linear && rand() % 4 == 0) {
+               srcx = 0;
+               srcy = 0;
+               srcz = 0;
+            }
+         }
+
+         /* GPU copy */
+         u_box_3d(srcx, srcy, srcz, width, height, depth, &box);
+         sctx->dma_copy(ctx, dst, 0, dstx, dsty, dstz, src, 0, &box);
+
+         /* See which engine was used. */
+         gfx_blits += sctx->num_draw_calls > old_num_draw_calls;
+         dma_blits += sctx->num_dma_calls > old_num_dma_calls;
+         cs_blits += sctx->num_compute_calls > old_num_cs_calls;
+
+         /* CPU copy */
+         util_copy_box(dst_cpu.ptr, tdst.format, dst_cpu.stride, dst_cpu.layer_stride, dstx, dsty,
+                       dstz, width, height, depth, src_cpu.ptr, src_cpu.stride,
+                       src_cpu.layer_stride, srcx, srcy, srcz);
+      }
+
+      pass = compare_textures(ctx, dst, &dst_cpu);
+      if (pass)
+         num_pass++;
+      else
+         num_fail++;
+
+      printf("BLITs: GFX = %2u, DMA = %2u, CS = %2u, %s [%u/%u]\n", gfx_blits, dma_blits, cs_blits,
+             pass ? "pass" : "fail", num_pass, num_pass + num_fail);
+
+      /* cleanup */
+      pipe_resource_reference(&src, NULL);
+      pipe_resource_reference(&dst, NULL);
+      free(src_cpu.ptr);
+      free(dst_cpu.ptr);
+   }
+
+   ctx->destroy(ctx);
+   exit(0);
 }
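For context (a hedged note, not part of the diff): si_test_dma() is not reached from normal rendering; it is hooked up to the driver's debug options (the testdma flag of the AMD_DEBUG/R600_DEBUG environment variable in Mesa of this era, if that flag name is exact), runs at screen creation, and terminates the process via the exit(0) above when the loop is interrupted.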
index 4eec3d124594bfd9d9e1659c494dae55a7ddd90b..116bfe690693d49b491e581f26573cd98a1d0fe8 100644 (file)
 #include "si_pipe.h"
 #include "si_query.h"
 
-#define MIN_SIZE       512
-#define MAX_SIZE       (128 * 1024 * 1024)
-#define SIZE_SHIFT     1
-#define NUM_RUNS       128
+#define MIN_SIZE   512
+#define MAX_SIZE   (128 * 1024 * 1024)
+#define SIZE_SHIFT 1
+#define NUM_RUNS   128
 
 static double get_MBps_rate(unsigned num_bytes, unsigned ns)
 {
-       return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
+   return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
 }
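A quick sanity check of the rate formula above (illustrative only, not part of this change):

   /* get_MBps_rate(1024 * 1024, 100 * 1000) == 10000.0:
    * one MiB moved in 100 microseconds is reported as 10000 MB/s. */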
 
 void si_test_dma_perf(struct si_screen *sscreen)
 {
-       struct pipe_screen *screen = &sscreen->b;
-       struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
-       struct si_context *sctx = (struct si_context*)ctx;
-       const uint32_t clear_value = 0x12345678;
-       static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
-       static const unsigned cs_waves_per_sh_list[] = {1, 2, 4, 8, 16, 0};
+   struct pipe_screen *screen = &sscreen->b;
+   struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
+   struct si_context *sctx = (struct si_context *)ctx;
+   const uint32_t clear_value = 0x12345678;
+   static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
+   static const unsigned cs_waves_per_sh_list[] = {1, 2, 4, 8, 16, 0};
 
 #define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
-#define NUM_METHODS (4 + 2*NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
-
-       static const char *method_str[] = {
-               "CP MC   ",
-               "CP L2   ",
-               "CP L2   ",
-               "SDMA    ",
-       };
-       static const char *placement_str[] = {
-               /* Clear */
-               "fill->VRAM",
-               "fill->GTT ",
-               /* Copy */
-               "VRAM->VRAM",
-               "VRAM->GTT ",
-               "GTT ->VRAM",
-       };
-
-       printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
-       printf("Heap       ,Method  ,L2p,Wa,");
-       for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
-               if (size >= 1024)
-                       printf("%6uKB,", size / 1024);
-               else
-                       printf(" %6uB,", size);
-       }
-       printf("\n");
-
-       /* results[log2(size)][placement][method][] */
-       struct si_result {
-               bool is_valid;
-               bool is_cp;
-               bool is_sdma;
-               bool is_cs;
-               unsigned cache_policy;
-               unsigned dwords_per_thread;
-               unsigned waves_per_sh;
-               unsigned score;
-               unsigned index; /* index in results[x][y][index] */
-       } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};
-
-       /* Run benchmarks. */
-       for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
-               bool is_copy = placement >= 2;
-
-               printf("-----------,--------,---,--,");
-               for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
-                       printf("--------,");
-               printf("\n");
-
-               for (unsigned method = 0; method < NUM_METHODS; method++) {
-                       bool test_cp = method <= 2;
-                       bool test_sdma = method == 3;
-                       bool test_cs = method >= 4;
-                       unsigned cs_method = method - 4;
-                       STATIC_ASSERT(L2_STREAM + 1 == L2_LRU);
-                       unsigned cs_waves_per_sh =
-                               test_cs ? cs_waves_per_sh_list[cs_method / (2*NUM_SHADERS)] : 0;
-                       cs_method %= 2*NUM_SHADERS;
-                       unsigned cache_policy = test_cp ? method % 3 :
-                                               test_cs ? L2_STREAM + (cs_method / NUM_SHADERS) : 0;
-                       unsigned cs_dwords_per_thread =
-                               test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;
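To make the method decoding above concrete (an illustrative decode under the table sizes declared earlier, not part of this change): methods 0-2 are CP DMA with the three cache policies, method 3 is SDMA, and higher indices decode into compute variants, e.g.:

   /* Assuming NUM_SHADERS == 7, so 2*NUM_SHADERS == 14:
    *   method = 25  ->  cs_method = 21
    *     cs_waves_per_sh      = cs_waves_per_sh_list[21 / 14]     = 2
    *     cs_method %= 14      -> 7
    *     cache_policy         = L2_STREAM + (7 / 7)               = L2_LRU
    *     cs_dwords_per_thread = cs_dwords_per_thread_list[7 % 7]  = 64
    *   i.e. a compute clear/copy writing 64 dwords per thread with L2 LRU
    *   caching, limited to 2 waves per SH.
    */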
-
-                       if (test_sdma && !sctx->sdma_cs)
-                               continue;
-
-                       if (sctx->chip_class == GFX6) {
-                               /* GFX6 doesn't support CP DMA operations through L2. */
-                               if (test_cp && cache_policy != L2_BYPASS)
-                                       continue;
-                               /* WAVES_PER_SH is in multiples of 16 on GFX6. */
-                               if (test_cs && cs_waves_per_sh % 16 != 0)
-                                       continue;
-                       }
-
-                       printf("%s ,", placement_str[placement]);
-                       if (test_cs) {
-                               printf("CS x%-4u,%3s,", cs_dwords_per_thread,
-                                      cache_policy == L2_LRU ? "LRU" :
-                                      cache_policy == L2_STREAM ? "Str" : "");
-                       } else {
-                               printf("%s,%3s,", method_str[method],
-                                      method == L2_LRU ? "LRU" :
-                                      method == L2_STREAM ? "Str" : "");
-                       }
-                       if (test_cs && cs_waves_per_sh)
-                               printf("%2u,", cs_waves_per_sh);
-                       else
-                               printf("  ,");
-
-                       double score = 0;
-                       for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
-                               /* Don't test bigger sizes if it's too slow. Print 0. */
-                               if (size >= 512*1024 &&
-                                   score < 400 * (size / (4*1024*1024))) {
-                                       printf("%7.0f ,", 0.0);
-                                       continue;
-                               }
-
-                               enum pipe_resource_usage dst_usage, src_usage;
-                               struct pipe_resource *dst, *src;
-                               struct pipe_query *q[NUM_RUNS];
-                               unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
-
-                               if (test_sdma) {
-                                       if (sctx->chip_class == GFX6)
-                                               query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
-                                       else
-                                               query_type = SI_QUERY_TIME_ELAPSED_SDMA;
-                               }
-
-                               if (placement == 0 || placement == 2 || placement == 4)
-                                       dst_usage = PIPE_USAGE_DEFAULT;
-                               else
-                                       dst_usage = PIPE_USAGE_STREAM;
-
-                               if (placement == 2 || placement == 3)
-                                       src_usage = PIPE_USAGE_DEFAULT;
-                               else
-                                       src_usage = PIPE_USAGE_STREAM;
-
-                               dst = pipe_buffer_create(screen, 0, dst_usage, size);
-                               src = is_copy ? pipe_buffer_create(screen, 0, src_usage, size) : NULL;
-
-                               /* Run tests. */
-                               for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
-                                       q[iter] = ctx->create_query(ctx, query_type, 0);
-                                       ctx->begin_query(ctx, q[iter]);
-
-                                       if (test_cp) {
-                                               /* CP DMA */
-                                               if (is_copy) {
-                                                       si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0,
-                                                                             SI_COHERENCY_NONE, cache_policy);
-                                               } else {
-                                                       si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size,
-                                                                              clear_value, 0,
-                                                                              SI_COHERENCY_NONE, cache_policy);
-                                               }
-                                       } else if (test_sdma) {
-                                               /* SDMA */
-                                               if (is_copy) {
-                                                       si_sdma_copy_buffer(sctx, dst, src, 0, 0, size);
-                                               } else {
-                                                       si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
-                                               }
-                                       } else {
-                                               /* Compute */
-                                               /* The memory accesses are coalesced, meaning that the 1st instruction writes
-                                                * the 1st contiguous block of data for the whole wave, the 2nd instruction
-                                                * writes the 2nd contiguous block of data, etc.
-                                                */
-                                               unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
-                                               unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
-                                               unsigned dwords_per_wave = cs_dwords_per_thread * 64;
-
-                                               unsigned num_dwords = size / 4;
-                                               unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
-
-                                               void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
-                                                                                       cache_policy == L2_STREAM, is_copy);
-
-                                               struct pipe_grid_info info = {};
-                                               info.block[0] = MIN2(64, num_instructions);
-                                               info.block[1] = 1;
-                                               info.block[2] = 1;
-                                               info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
-                                               info.grid[1] = 1;
-                                               info.grid[2] = 1;
-
-                                               struct pipe_shader_buffer sb[2] = {};
-                                               sb[0].buffer = dst;
-                                               sb[0].buffer_size = size;
-
-                                               if (is_copy) {
-                                                       sb[1].buffer = src;
-                                                       sb[1].buffer_size = size;
-                                               } else {
-                                                       for (unsigned i = 0; i < 4; i++)
-                                                               sctx->cs_user_data[i] = clear_value;
-                                               }
-
-                                               sctx->flags |= SI_CONTEXT_INV_VCACHE |
-                                                              SI_CONTEXT_INV_SCACHE;
-
-                                               ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0,
-                                                                       is_copy ? 2 : 1, sb, 0x1);
-                                               ctx->bind_compute_state(ctx, cs);
-                                               sctx->cs_max_waves_per_sh = cs_waves_per_sh;
-
-                                               ctx->launch_grid(ctx, &info);
-
-                                               ctx->bind_compute_state(ctx, NULL);
-                                               ctx->delete_compute_state(ctx, cs);
-                                               sctx->cs_max_waves_per_sh = 0; /* disable the limit */
-
-                                               sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
-                                       }
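As a sanity check on the compute dispatch sizing above (illustrative arithmetic assuming cs_dwords_per_thread = 16 and size = 1 MiB; not part of this change):

   /*   instructions_per_thread = MAX2(1, 16 / 4)            = 4
    *   dwords_per_instruction  = 16 / 4                      = 4
    *   dwords_per_wave         = 16 * 64                     = 1024
    *   num_dwords              = 1048576 / 4                 = 262144
    *   num_instructions        = DIV_ROUND_UP(262144, 4)     = 65536
    *   info.block[0]           = MIN2(64, 65536)             = 64   (one wave per group)
    *   info.grid[0]            = DIV_ROUND_UP(262144, 1024)  = 256  (groups)
    * so the buffer is processed by 256 groups of 64 threads, each thread
    * writing 16 dwords through four 4-dword stores.
    */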
-
-                                       /* Flush L2, so that we don't just test L2 cache performance. */
-                                       if (!test_sdma) {
-                                               sctx->flags |= SI_CONTEXT_WB_L2;
-                                               sctx->emit_cache_flush(sctx);
-                                       }
-
-                                       ctx->end_query(ctx, q[iter]);
-                                       ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
-                               }
-                               pipe_resource_reference(&dst, NULL);
-                               pipe_resource_reference(&src, NULL);
-
-                               /* Get results. */
-                               uint64_t min = ~0ull, max = 0, total = 0;
-
-                               for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
-                                       union pipe_query_result result;
-
-                                       ctx->get_query_result(ctx, q[iter], true, &result);
-                                       ctx->destroy_query(ctx, q[iter]);
-
-                                       min = MIN2(min, result.u64);
-                                       max = MAX2(max, result.u64);
-                                       total += result.u64;
-                               }
-
-                               score = get_MBps_rate(size, total / (double)NUM_RUNS);
-                               printf("%7.0f ,", score);
-                               fflush(stdout);
-
-                               struct si_result *r = &results[util_logbase2(size)][placement][method];
-                               r->is_valid = true;
-                               r->is_cp = test_cp;
-                               r->is_sdma = test_sdma;
-                               r->is_cs = test_cs;
-                               r->cache_policy = cache_policy;
-                               r->dwords_per_thread = cs_dwords_per_thread;
-                               r->waves_per_sh = cs_waves_per_sh;
-                               r->score = score;
-                               r->index = method;
-                       }
-                       puts("");
-               }
-       }
-
-       puts("");
-       puts("static struct si_method");
-       printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool cached)\n",
-              sctx->screen->info.name);
-       puts("{");
-       puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
-
-       /* Analyze results and find the best methods. */
-       for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
-               if (placement == 0)
-                       puts("   if (dst == RADEON_DOMAIN_VRAM) {");
-               else if (placement == 1)
-                       puts("   } else { /* GTT */");
-               else if (placement == 2) {
-                       puts("}");
-                       puts("");
-                       puts("static struct si_method");
-                       printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
-                              sctx->screen->info.name);
-                       printf("                     uint64_t size64, bool async, bool cached)\n");
-                       puts("{");
-                       puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
-                       puts("   if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
-               } else if (placement == 3)
-                       puts("   } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
-               else
-                       puts("   } else { /* GTT -> VRAM */");
-
-               for (unsigned mode = 0; mode < 3; mode++) {
-                       bool async = mode == 0;
-                       bool cached = mode == 1;
-
-                       if (async)
-                               puts("      if (async) { /* SDMA or async compute */");
-                       else if (cached)
-                               puts("      if (cached) { /* gfx ring */");
-                       else
-                               puts("      } else { /* gfx ring - uncached */");
-
-                       /* The list of best chosen methods. */
-                       struct si_result *methods[32];
-                       unsigned method_max_size[32];
-                       unsigned num_methods = 0;
-
-                       for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
-                               /* Find the best method. */
-                               struct si_result *best = NULL;
-
-                               for (unsigned i = 0; i < NUM_METHODS; i++) {
-                                       struct si_result *r = &results[util_logbase2(size)][placement][i];
-
-                                       if (!r->is_valid)
-                                               continue;
-
-                                       /* Ban CP DMA clears via MC on <= GFX8. They are super slow
-                                        * on GTT, which we can get due to BO evictions.
-                                        */
-                                       if (sctx->chip_class <= GFX8 && placement == 1 &&
-                                           r->is_cp && r->cache_policy == L2_BYPASS)
-                                               continue;
-
-                                       if (async) {
-                                               /* The following constraints for compute IBs try to limit
-                                                * resource usage so as not to decrease the performance
-                                                * of gfx IBs too much.
-                                                */
-
-                                               /* Don't use CP DMA on asynchronous rings, because
-                                                * the engine is shared with gfx IBs.
-                                                */
-                                               if (r->is_cp)
-                                                       continue;
-
-                                               /* Don't use L2 caching on asynchronous rings to minimize
-                                                * L2 usage.
-                                                */
-                                               if (r->cache_policy == L2_LRU)
-                                                       continue;
-
-                                               /* Asynchronous compute recommends waves_per_sh != 0
-                                                * to limit CU usage. */
-                                               if (r->is_cs && r->waves_per_sh == 0)
-                                                       continue;
-                                       } else {
-                                               /* SDMA is always asynchronous */
-                                               if (r->is_sdma)
-                                                       continue;
-
-                                               if (cached && r->cache_policy == L2_BYPASS)
-                                                       continue;
-                                               if (!cached && r->cache_policy == L2_LRU)
-                                                       continue;
-                                       }
-
-                                       if (!best) {
-                                               best = r;
-                                               continue;
-                                       }
-
-                                       /* Assume some measurement error. Earlier methods occupy fewer
-                                        * resources, so the next method is always more greedy, and we
-                                        * don't want to select it due to a measurement error.
-                                        */
-                                       double min_improvement = 1.03;
-
-                                       if (best->score * min_improvement < r->score)
-                                               best = r;
-                               }
-
-                               if (num_methods > 0) {
-                                       unsigned prev_index = num_methods - 1;
-                                       struct si_result *prev = methods[prev_index];
-                                       struct si_result *prev_this_size = &results[util_logbase2(size)][placement][prev->index];
-
-                                       /* If the best one is also the best for the previous size,
-                                        * just bump the size for the previous one.
-                                        *
-                                        * If there is no best, it means all methods were too slow
-                                        * for this size and were not tested. Use the best one for
-                                        * the previous size.
-                                        */
-                                       if (!best ||
-                                           /* If it's the same method as for the previous size: */
-                                           (prev->is_cp == best->is_cp &&
-                                            prev->is_sdma == best->is_sdma &&
-                                            prev->is_cs == best->is_cs &&
-                                            prev->cache_policy == best->cache_policy &&
-                                            prev->dwords_per_thread == best->dwords_per_thread &&
-                                            prev->waves_per_sh == best->waves_per_sh) ||
-                                           /* If the method for the previous size is also the best
-                                            * for this size: */
-                                           (prev_this_size->is_valid &&
-                                            prev_this_size->score * 1.03 > best->score)) {
-                                               method_max_size[prev_index] = size;
-                                               continue;
-                                       }
-                               }
-
-                               /* Add it to the list. */
-                               assert(num_methods < ARRAY_SIZE(methods));
-                               methods[num_methods] = best;
-                               method_max_size[num_methods] = size;
-                               num_methods++;
-                       }
-
-                       for (unsigned i = 0; i < num_methods; i++) {
-                               struct si_result *best = methods[i];
-                               unsigned size = method_max_size[i];
-
-                               /* The size threshold is between the current benchmarked
-                                * size and the next benchmarked size. */
-                               if (i < num_methods - 1)
-                                       printf("         if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
-                               else if (i > 0)
-                                       printf("         else                   ");
-                               else
-                                       printf("         ");
-                               printf("return ");
-
-                               assert(best);
-                               if (best->is_cp) {
-                                       printf("CP_DMA(%s);\n",
-                                              best->cache_policy == L2_BYPASS ? "L2_BYPASS" :
-                                              best->cache_policy == L2_LRU ?    "L2_LRU   " : "L2_STREAM");
-                               }
-                               if (best->is_sdma)
-                                       printf("SDMA;\n");
-                               if (best->is_cs) {
-                                       printf("COMPUTE(%s, %u, %u);\n",
-                                              best->cache_policy == L2_LRU ? "L2_LRU   " : "L2_STREAM",
-                                              best->dwords_per_thread,
-                                              best->waves_per_sh);
-                               }
-                       }
-               }
-               puts("      }");
-       }
-       puts("   }");
-       puts("}");
-
-       ctx->destroy(ctx);
-       exit(0);
+#define NUM_METHODS (4 + 2 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
+
+   static const char *method_str[] = {
+      "CP MC   ",
+      "CP L2   ",
+      "CP L2   ",
+      "SDMA    ",
+   };
+   static const char *placement_str[] = {
+      /* Clear */
+      "fill->VRAM",
+      "fill->GTT ",
+      /* Copy */
+      "VRAM->VRAM",
+      "VRAM->GTT ",
+      "GTT ->VRAM",
+   };
+
+   printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
+   printf("Heap       ,Method  ,L2p,Wa,");
+   for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+      if (size >= 1024)
+         printf("%6uKB,", size / 1024);
+      else
+         printf(" %6uB,", size);
+   }
+   printf("\n");
+
+   /* results[log2(size)][placement][method][] */
+   struct si_result {
+      bool is_valid;
+      bool is_cp;
+      bool is_sdma;
+      bool is_cs;
+      unsigned cache_policy;
+      unsigned dwords_per_thread;
+      unsigned waves_per_sh;
+      unsigned score;
+      unsigned index; /* index in results[x][y][index] */
+   } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};
+
+   /* Run benchmarks. */
+   for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
+      bool is_copy = placement >= 2;
+
+      printf("-----------,--------,---,--,");
+      for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
+         printf("--------,");
+      printf("\n");
+
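+      /* Methods 0-2 are CP DMA (one per cache policy), method 3 is SDMA, and
+       * methods >= 4 enumerate the compute variants:
+       * waves_per_sh x {L2_STREAM, L2_LRU} x dwords_per_thread. */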
+      for (unsigned method = 0; method < NUM_METHODS; method++) {
+         bool test_cp = method <= 2;
+         bool test_sdma = method == 3;
+         bool test_cs = method >= 4;
+         unsigned cs_method = method - 4;
+         STATIC_ASSERT(L2_STREAM + 1 == L2_LRU);
+         unsigned cs_waves_per_sh =
+            test_cs ? cs_waves_per_sh_list[cs_method / (2 * NUM_SHADERS)] : 0;
+         cs_method %= 2 * NUM_SHADERS;
+         unsigned cache_policy =
+            test_cp ? method % 3 : test_cs ? L2_STREAM + (cs_method / NUM_SHADERS) : 0;
+         unsigned cs_dwords_per_thread =
+            test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;
+
+         if (test_sdma && !sctx->sdma_cs)
+            continue;
+
+         if (sctx->chip_class == GFX6) {
+            /* GFX6 doesn't support CP DMA operations through L2. */
+            if (test_cp && cache_policy != L2_BYPASS)
+               continue;
+            /* WAVES_PER_SH is in multiples of 16 on GFX6. */
+            if (test_cs && cs_waves_per_sh % 16 != 0)
+               continue;
+         }
+
+         printf("%s ,", placement_str[placement]);
+         if (test_cs) {
+            printf("CS x%-4u,%3s,", cs_dwords_per_thread,
+                   cache_policy == L2_LRU ? "LRU" : cache_policy == L2_STREAM ? "Str" : "");
+         } else {
+            printf("%s,%3s,", method_str[method],
+                   method == L2_LRU ? "LRU" : method == L2_STREAM ? "Str" : "");
+         }
+         if (test_cs && cs_waves_per_sh)
+            printf("%2u,", cs_waves_per_sh);
+         else
+            printf("  ,");
+
+         double score = 0;
+         for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+            /* Don't test bigger sizes if it's too slow. Print 0. */
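+            /* "score" still holds the rate measured for the previous, smaller
+             * size, so the cutoff is roughly 400 MB/s per 4 MiB of the
+             * current size. */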
+            if (size >= 512 * 1024 && score < 400 * (size / (4 * 1024 * 1024))) {
+               printf("%7.0f ,", 0.0);
+               continue;
+            }
+
+            enum pipe_resource_usage dst_usage, src_usage;
+            struct pipe_resource *dst, *src;
+            struct pipe_query *q[NUM_RUNS];
+            unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
+
+            if (test_sdma) {
+               if (sctx->chip_class == GFX6)
+                  query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
+               else
+                  query_type = SI_QUERY_TIME_ELAPSED_SDMA;
+            }
+
+            if (placement == 0 || placement == 2 || placement == 4)
+               dst_usage = PIPE_USAGE_DEFAULT;
+            else
+               dst_usage = PIPE_USAGE_STREAM;
+
+            if (placement == 2 || placement == 3)
+               src_usage = PIPE_USAGE_DEFAULT;
+            else
+               src_usage = PIPE_USAGE_STREAM;
+
+            dst = pipe_buffer_create(screen, 0, dst_usage, size);
+            src = is_copy ? pipe_buffer_create(screen, 0, src_usage, size) : NULL;
+
+            /* Run tests. */
+            for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
+               q[iter] = ctx->create_query(ctx, query_type, 0);
+               ctx->begin_query(ctx, q[iter]);
+
+               if (test_cp) {
+                  /* CP DMA */
+                  if (is_copy) {
+                     si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0, SI_COHERENCY_NONE,
+                                           cache_policy);
+                  } else {
+                     si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size, clear_value, 0,
+                                            SI_COHERENCY_NONE, cache_policy);
+                  }
+               } else if (test_sdma) {
+                  /* SDMA */
+                  if (is_copy) {
+                     si_sdma_copy_buffer(sctx, dst, src, 0, 0, size);
+                  } else {
+                     si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
+                  }
+               } else {
+                  /* Compute */
+                  /* The memory accesses are coalesced, meaning that the 1st instruction writes
+                   * the 1st contiguous block of data for the whole wave, the 2nd instruction
+                   * writes the 2nd contiguous block of data, etc.
+                   */
+                  unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
+                  unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
+                  unsigned dwords_per_wave = cs_dwords_per_thread * 64;
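+                  /* e.g. with cs_dwords_per_thread = 16: 4 memory operations of
+                   * 4 dwords each per thread and 64 threads per workgroup, so
+                   * one workgroup handles 1024 dwords. */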
+
+                  unsigned num_dwords = size / 4;
+                  unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
+
+                  void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
+                                                          cache_policy == L2_STREAM, is_copy);
+
+                  struct pipe_grid_info info = {};
+                  info.block[0] = MIN2(64, num_instructions);
+                  info.block[1] = 1;
+                  info.block[2] = 1;
+                  info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
+                  info.grid[1] = 1;
+                  info.grid[2] = 1;
+
+                  struct pipe_shader_buffer sb[2] = {};
+                  sb[0].buffer = dst;
+                  sb[0].buffer_size = size;
+
+                  if (is_copy) {
+                     sb[1].buffer = src;
+                     sb[1].buffer_size = size;
+                  } else {
+                     for (unsigned i = 0; i < 4; i++)
+                        sctx->cs_user_data[i] = clear_value;
+                  }
+
+                  sctx->flags |= SI_CONTEXT_INV_VCACHE | SI_CONTEXT_INV_SCACHE;
+
+                  ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1);
+                  ctx->bind_compute_state(ctx, cs);
+                  sctx->cs_max_waves_per_sh = cs_waves_per_sh;
+
+                  ctx->launch_grid(ctx, &info);
+
+                  ctx->bind_compute_state(ctx, NULL);
+                  ctx->delete_compute_state(ctx, cs);
+                  sctx->cs_max_waves_per_sh = 0; /* disable the limit */
+
+                  sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+               }
+
+               /* Flush L2, so that we don't just test L2 cache performance. */
+               if (!test_sdma) {
+                  sctx->flags |= SI_CONTEXT_WB_L2;
+                  sctx->emit_cache_flush(sctx);
+               }
+
+               ctx->end_query(ctx, q[iter]);
+               ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
+            }
+            pipe_resource_reference(&dst, NULL);
+            pipe_resource_reference(&src, NULL);
+
+            /* Get results. */
+            uint64_t min = ~0ull, max = 0, total = 0;
+
+            for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
+               union pipe_query_result result;
+
+               ctx->get_query_result(ctx, q[iter], true, &result);
+               ctx->destroy_query(ctx, q[iter]);
+
+               min = MIN2(min, result.u64);
+               max = MAX2(max, result.u64);
+               total += result.u64;
+            }
+
+            score = get_MBps_rate(size, total / (double)NUM_RUNS);
+            printf("%7.0f ,", score);
+            fflush(stdout);
+
+            struct si_result *r = &results[util_logbase2(size)][placement][method];
+            r->is_valid = true;
+            r->is_cp = test_cp;
+            r->is_sdma = test_sdma;
+            r->is_cs = test_cs;
+            r->cache_policy = cache_policy;
+            r->dwords_per_thread = cs_dwords_per_thread;
+            r->waves_per_sh = cs_waves_per_sh;
+            r->score = score;
+            r->index = method;
+         }
+         puts("");
+      }
+   }
+
+   puts("");
+   puts("static struct si_method");
+   printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool "
+          "cached)\n",
+          sctx->screen->info.name);
+   puts("{");
+   puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
+
+   /* Analyze results and find the best methods. */
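+   /* Placements 0-1 emit the body of get_best_clear_for_*, placements 2-4 the
+    * body of get_best_copy_for_*; each placement gets async, cached and
+    * uncached branches whose size thresholds pick the fastest method. */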
+   for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
+      if (placement == 0)
+         puts("   if (dst == RADEON_DOMAIN_VRAM) {");
+      else if (placement == 1)
+         puts("   } else { /* GTT */");
+      else if (placement == 2) {
+         puts("}");
+         puts("");
+         puts("static struct si_method");
+         printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
+                sctx->screen->info.name);
+         printf("                     uint64_t size64, bool async, bool cached)\n");
+         puts("{");
+         puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
+         puts("   if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
+      } else if (placement == 3)
+         puts("   } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
+      else
+         puts("   } else { /* GTT -> VRAM */");
+
+      for (unsigned mode = 0; mode < 3; mode++) {
+         bool async = mode == 0;
+         bool cached = mode == 1;
+
+         if (async)
+            puts("      if (async) { /* SDMA or async compute */");
+         else if (cached)
+            puts("      if (cached) { /* gfx ring */");
+         else
+            puts("      } else { /* gfx ring - uncached */");
+
+         /* The list of best chosen methods. */
+         struct si_result *methods[32];
+         unsigned method_max_size[32];
+         unsigned num_methods = 0;
+
+         for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+            /* Find the best method. */
+            struct si_result *best = NULL;
+
+            for (unsigned i = 0; i < NUM_METHODS; i++) {
+               struct si_result *r = &results[util_logbase2(size)][placement][i];
+
+               if (!r->is_valid)
+                  continue;
+
+               /* Ban CP DMA clears via MC on <= GFX8. They are super slow
+                * on GTT, which we can get due to BO evictions.
+                */
+               if (sctx->chip_class <= GFX8 && placement == 1 && r->is_cp &&
+                   r->cache_policy == L2_BYPASS)
+                  continue;
+
+               if (async) {
+                  /* The following constraints for compute IBs try to limit
+                   * resource usage so as not to decrease the performance
+                   * of gfx IBs too much.
+                   */
+
+                  /* Don't use CP DMA on asynchronous rings, because
+                   * the engine is shared with gfx IBs.
+                   */
+                  if (r->is_cp)
+                     continue;
+
+                  /* Don't use L2 caching on asynchronous rings to minimize
+                   * L2 usage.
+                   */
+                  if (r->cache_policy == L2_LRU)
+                     continue;
+
+                  /* Asynchronous compute recommends waves_per_sh != 0
+                   * to limit CU usage. */
+                  if (r->is_cs && r->waves_per_sh == 0)
+                     continue;
+               } else {
+                  /* SDMA is always asynchronous */
+                  if (r->is_sdma)
+                     continue;
+
+                  if (cached && r->cache_policy == L2_BYPASS)
+                     continue;
+                  if (!cached && r->cache_policy == L2_LRU)
+                     continue;
+               }
+
+               if (!best) {
+                  best = r;
+                  continue;
+               }
+
+               /* Assume some measurement error. Earlier methods occupy fewer
+                * resources, so the next method is always more greedy, and we
+                * don't want to select it due to a measurement error.
+                */
+               double min_improvement = 1.03;
+
+               if (best->score * min_improvement < r->score)
+                  best = r;
+            }
+
+            if (num_methods > 0) {
+               unsigned prev_index = num_methods - 1;
+               struct si_result *prev = methods[prev_index];
+               struct si_result *prev_this_size =
+                  &results[util_logbase2(size)][placement][prev->index];
+
+               /* If the best one is also the best for the previous size,
+                * just bump the size for the previous one.
+                *
+                * If there is no best, it means all methods were too slow
+                * for this size and were not tested. Use the best one for
+                * the previous size.
+                */
+               if (!best ||
+                   /* If it's the same method as for the previous size: */
+                   (prev->is_cp == best->is_cp && prev->is_sdma == best->is_sdma &&
+                    prev->is_cs == best->is_cs && prev->cache_policy == best->cache_policy &&
+                    prev->dwords_per_thread == best->dwords_per_thread &&
+                    prev->waves_per_sh == best->waves_per_sh) ||
+                   /* If the method for the previous size is also the best
+                    * for this size: */
+                   (prev_this_size->is_valid && prev_this_size->score * 1.03 > best->score)) {
+                  method_max_size[prev_index] = size;
+                  continue;
+               }
+            }
+
+            /* Add it to the list. */
+            assert(num_methods < ARRAY_SIZE(methods));
+            methods[num_methods] = best;
+            method_max_size[num_methods] = size;
+            num_methods++;
+         }
+
+         for (unsigned i = 0; i < num_methods; i++) {
+            struct si_result *best = methods[i];
+            unsigned size = method_max_size[i];
+
+            /* The size threshold is between the current benchmarked
+             * size and the next benchmarked size. */
+            if (i < num_methods - 1)
+               printf("         if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
+            else if (i > 0)
+               printf("         else                   ");
+            else
+               printf("         ");
+            printf("return ");
+
+            assert(best);
+            if (best->is_cp) {
+               printf("CP_DMA(%s);\n",
+                      best->cache_policy == L2_BYPASS
+                         ? "L2_BYPASS"
+                         : best->cache_policy == L2_LRU ? "L2_LRU   " : "L2_STREAM");
+            }
+            if (best->is_sdma)
+               printf("SDMA;\n");
+            if (best->is_cs) {
+               printf("COMPUTE(%s, %u, %u);\n",
+                      best->cache_policy == L2_LRU ? "L2_LRU   " : "L2_STREAM",
+                      best->dwords_per_thread, best->waves_per_sh);
+            }
+         }
+      }
+      puts("      }");
+   }
+   puts("   }");
+   puts("}");
+
+   ctx->destroy(ctx);
+   exit(0);
 }
index bcf9187082bceea68b1767aaf55158c6172a7451..4f7744a887df3b4756e6859c670d71303b9f537f 100644 (file)
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
+#include "drm-uapi/drm_fourcc.h"
 #include "si_pipe.h"
 #include "si_query.h"
+#include "sid.h"
+#include "state_tracker/drm_driver.h"
 #include "util/format/u_format.h"
+#include "util/os_time.h"
 #include "util/u_log.h"
 #include "util/u_memory.h"
 #include "util/u_pack_color.h"
 #include "util/u_resource.h"
 #include "util/u_surface.h"
 #include "util/u_transfer.h"
-#include "util/os_time.h"
+
 #include <errno.h>
 #include <inttypes.h>
-#include "state_tracker/drm_driver.h"
-#include "sid.h"
-#include "amd/addrlib/inc/addrinterface.h"
-#include "drm-uapi/drm_fourcc.h"
 
-static enum radeon_surf_mode
-si_choose_tiling(struct si_screen *sscreen,
-                const struct pipe_resource *templ, bool tc_compatible_htile);
+#include "amd/addrlib/inc/addrinterface.h"
 
+static enum radeon_surf_mode si_choose_tiling(struct si_screen *sscreen,
+                                              const struct pipe_resource *templ,
+                                              bool tc_compatible_htile);
 
-bool si_prepare_for_dma_blit(struct si_context *sctx,
-                            struct si_texture *dst,
-                            unsigned dst_level, unsigned dstx,
-                            unsigned dsty, unsigned dstz,
-                            struct si_texture *src,
-                            unsigned src_level,
-                            const struct pipe_box *src_box)
+bool si_prepare_for_dma_blit(struct si_context *sctx, struct si_texture *dst, unsigned dst_level,
+                             unsigned dstx, unsigned dsty, unsigned dstz, struct si_texture *src,
+                             unsigned src_level, const struct pipe_box *src_box)
 {
-       if (!sctx->sdma_cs)
-               return false;
-
-       if (dst->surface.bpe != src->surface.bpe)
-               return false;
-
-       /* MSAA: Blits don't exist in the real world. */
-       if (src->buffer.b.b.nr_samples > 1 ||
-           dst->buffer.b.b.nr_samples > 1)
-               return false;
-
-       /* Depth-stencil surfaces:
-        *   When dst is linear, the DB->CB copy preserves HTILE.
-        *   When dst is tiled, the 3D path must be used to update HTILE.
-        */
-       if (src->is_depth || dst->is_depth)
-               return false;
-
-       /* DCC as:
-        *   src: Use the 3D path. DCC decompression is expensive.
-        *   dst: Use the 3D path to compress the pixels with DCC.
-        */
-       if (vi_dcc_enabled(src, src_level) ||
-           vi_dcc_enabled(dst, dst_level))
-               return false;
-
-       /* CMASK as:
-        *   src: Both texture and SDMA paths need decompression. Use SDMA.
-        *   dst: If overwriting the whole texture, discard CMASK and use
-        *        SDMA. Otherwise, use the 3D path.
-        */
-       if (dst->cmask_buffer && dst->dirty_level_mask & (1 << dst_level)) {
-               /* The CMASK clear is only enabled for the first level. */
-               assert(dst_level == 0);
-               if (!util_texrange_covers_whole_level(&dst->buffer.b.b, dst_level,
-                                                     dstx, dsty, dstz, src_box->width,
-                                                     src_box->height, src_box->depth))
-                       return false;
-
-               si_texture_discard_cmask(sctx->screen, dst);
-       }
-
-       /* All requirements are met. Prepare textures for SDMA. */
-       if (src->cmask_buffer && src->dirty_level_mask & (1 << src_level))
-               sctx->b.flush_resource(&sctx->b, &src->buffer.b.b);
-
-       assert(!(src->dirty_level_mask & (1 << src_level)));
-       assert(!(dst->dirty_level_mask & (1 << dst_level)));
-
-       return true;
+   if (!sctx->sdma_cs)
+      return false;
+
+   if (dst->surface.bpe != src->surface.bpe)
+      return false;
+
+   /* MSAA: Blits don't exist in the real world. */
+   if (src->buffer.b.b.nr_samples > 1 || dst->buffer.b.b.nr_samples > 1)
+      return false;
+
+   /* Depth-stencil surfaces:
+    *   When dst is linear, the DB->CB copy preserves HTILE.
+    *   When dst is tiled, the 3D path must be used to update HTILE.
+    */
+   if (src->is_depth || dst->is_depth)
+      return false;
+
+   /* DCC as:
+    *   src: Use the 3D path. DCC decompression is expensive.
+    *   dst: Use the 3D path to compress the pixels with DCC.
+    */
+   if (vi_dcc_enabled(src, src_level) || vi_dcc_enabled(dst, dst_level))
+      return false;
+
+   /* CMASK as:
+    *   src: Both texture and SDMA paths need decompression. Use SDMA.
+    *   dst: If overwriting the whole texture, discard CMASK and use
+    *        SDMA. Otherwise, use the 3D path.
+    */
+   if (dst->cmask_buffer && dst->dirty_level_mask & (1 << dst_level)) {
+      /* The CMASK clear is only enabled for the first level. */
+      assert(dst_level == 0);
+      if (!util_texrange_covers_whole_level(&dst->buffer.b.b, dst_level, dstx, dsty, dstz,
+                                            src_box->width, src_box->height, src_box->depth))
+         return false;
+
+      si_texture_discard_cmask(sctx->screen, dst);
+   }
+
+   /* All requirements are met. Prepare textures for SDMA. */
+   if (src->cmask_buffer && src->dirty_level_mask & (1 << src_level))
+      sctx->b.flush_resource(&sctx->b, &src->buffer.b.b);
+
+   assert(!(src->dirty_level_mask & (1 << src_level)));
+   assert(!(dst->dirty_level_mask & (1 << dst_level)));
+
+   return true;
 }
 
 /* Same as resource_copy_region, except that both upsampling and downsampling are allowed. */
-static void si_copy_region_with_blit(struct pipe_context *pipe,
-                                    struct pipe_resource *dst,
-                                    unsigned dst_level,
-                                    unsigned dstx, unsigned dsty, unsigned dstz,
-                                    struct pipe_resource *src,
-                                    unsigned src_level,
-                                    const struct pipe_box *src_box)
+static void si_copy_region_with_blit(struct pipe_context *pipe, struct pipe_resource *dst,
+                                     unsigned dst_level, unsigned dstx, unsigned dsty,
+                                     unsigned dstz, struct pipe_resource *src, unsigned src_level,
+                                     const struct pipe_box *src_box)
 {
-       struct pipe_blit_info blit;
-
-       memset(&blit, 0, sizeof(blit));
-       blit.src.resource = src;
-       blit.src.format = src->format;
-       blit.src.level = src_level;
-       blit.src.box = *src_box;
-       blit.dst.resource = dst;
-       blit.dst.format = dst->format;
-       blit.dst.level = dst_level;
-       blit.dst.box.x = dstx;
-       blit.dst.box.y = dsty;
-       blit.dst.box.z = dstz;
-       blit.dst.box.width = src_box->width;
-       blit.dst.box.height = src_box->height;
-       blit.dst.box.depth = src_box->depth;
-       blit.mask = util_format_get_mask(dst->format);
-       blit.filter = PIPE_TEX_FILTER_NEAREST;
-
-       if (blit.mask) {
-               pipe->blit(pipe, &blit);
-       }
+   struct pipe_blit_info blit;
+
+   memset(&blit, 0, sizeof(blit));
+   blit.src.resource = src;
+   blit.src.format = src->format;
+   blit.src.level = src_level;
+   blit.src.box = *src_box;
+   blit.dst.resource = dst;
+   blit.dst.format = dst->format;
+   blit.dst.level = dst_level;
+   blit.dst.box.x = dstx;
+   blit.dst.box.y = dsty;
+   blit.dst.box.z = dstz;
+   blit.dst.box.width = src_box->width;
+   blit.dst.box.height = src_box->height;
+   blit.dst.box.depth = src_box->depth;
+   blit.mask = util_format_get_mask(dst->format);
+   blit.filter = PIPE_TEX_FILTER_NEAREST;
+
+   if (blit.mask) {
+      pipe->blit(pipe, &blit);
+   }
 }
 
 /* Copy from a full GPU texture to a transfer's staging one. */
 static void si_copy_to_staging_texture(struct pipe_context *ctx, struct si_transfer *stransfer)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
-       struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer;
-       struct pipe_resource *dst = &stransfer->staging->b.b;
-       struct pipe_resource *src = transfer->resource;
-
-       if (src->nr_samples > 1 || ((struct si_texture*)src)->is_depth) {
-               si_copy_region_with_blit(ctx, dst, 0, 0, 0, 0,
-                                          src, transfer->level, &transfer->box);
-               return;
-       }
-
-       sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, transfer->level,
-                      &transfer->box);
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct pipe_transfer *transfer = (struct pipe_transfer *)stransfer;
+   struct pipe_resource *dst = &stransfer->staging->b.b;
+   struct pipe_resource *src = transfer->resource;
+
+   if (src->nr_samples > 1 || ((struct si_texture *)src)->is_depth) {
+      si_copy_region_with_blit(ctx, dst, 0, 0, 0, 0, src, transfer->level, &transfer->box);
+      return;
+   }
+
+   sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, transfer->level, &transfer->box);
 }
 
 /* Copy from a transfer's staging texture to a full GPU one. */
 static void si_copy_from_staging_texture(struct pipe_context *ctx, struct si_transfer *stransfer)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
-       struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer;
-       struct pipe_resource *dst = transfer->resource;
-       struct pipe_resource *src = &stransfer->staging->b.b;
-       struct pipe_box sbox;
-
-       u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height, transfer->box.depth, &sbox);
-
-       if (dst->nr_samples > 1 || ((struct si_texture*)dst)->is_depth) {
-               si_copy_region_with_blit(ctx, dst, transfer->level,
-                                          transfer->box.x, transfer->box.y, transfer->box.z,
-                                          src, 0, &sbox);
-               return;
-       }
-
-       if (util_format_is_compressed(dst->format)) {
-               sbox.width = util_format_get_nblocksx(dst->format, sbox.width);
-               sbox.height = util_format_get_nblocksx(dst->format, sbox.height);
-       }
-
-       sctx->dma_copy(ctx, dst, transfer->level,
-                      transfer->box.x, transfer->box.y, transfer->box.z,
-                      src, 0, &sbox);
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct pipe_transfer *transfer = (struct pipe_transfer *)stransfer;
+   struct pipe_resource *dst = transfer->resource;
+   struct pipe_resource *src = &stransfer->staging->b.b;
+   struct pipe_box sbox;
+
+   u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height, transfer->box.depth, &sbox);
+
+   if (dst->nr_samples > 1 || ((struct si_texture *)dst)->is_depth) {
+      si_copy_region_with_blit(ctx, dst, transfer->level, transfer->box.x, transfer->box.y,
+                               transfer->box.z, src, 0, &sbox);
+      return;
+   }
+
+   if (util_format_is_compressed(dst->format)) {
+      sbox.width = util_format_get_nblocksx(dst->format, sbox.width);
+      sbox.height = util_format_get_nblocksx(dst->format, sbox.height);
+   }
+
+   sctx->dma_copy(ctx, dst, transfer->level, transfer->box.x, transfer->box.y, transfer->box.z, src,
+                  0, &sbox);
 }
 
-static unsigned si_texture_get_offset(struct si_screen *sscreen,
-                                     struct si_texture *tex, unsigned level,
-                                     const struct pipe_box *box,
-                                     unsigned *stride,
-                                     unsigned *layer_stride)
+static unsigned si_texture_get_offset(struct si_screen *sscreen, struct si_texture *tex,
+                                      unsigned level, const struct pipe_box *box, unsigned *stride,
+                                      unsigned *layer_stride)
 {
-       if (sscreen->info.chip_class >= GFX9) {
-               *stride = tex->surface.u.gfx9.surf_pitch * tex->surface.bpe;
-               *layer_stride = tex->surface.u.gfx9.surf_slice_size;
-
-               if (!box)
-                       return 0;
-
-               /* Each texture is an array of slices. Each slice is an array
-                * of mipmap levels. */
-               return tex->surface.u.gfx9.surf_offset +
-                      box->z * tex->surface.u.gfx9.surf_slice_size +
-                      tex->surface.u.gfx9.offset[level] +
-                      (box->y / tex->surface.blk_h *
-                       tex->surface.u.gfx9.surf_pitch +
-                       box->x / tex->surface.blk_w) * tex->surface.bpe;
-       } else {
-               *stride = tex->surface.u.legacy.level[level].nblk_x *
-                         tex->surface.bpe;
-               assert((uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 <= UINT_MAX);
-               *layer_stride = (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4;
-
-               if (!box)
-                       return tex->surface.u.legacy.level[level].offset;
-
-               /* Each texture is an array of mipmap levels. Each level is
-                * an array of slices. */
-               return tex->surface.u.legacy.level[level].offset +
-                      box->z * (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 +
-                      (box->y / tex->surface.blk_h *
-                       tex->surface.u.legacy.level[level].nblk_x +
-                       box->x / tex->surface.blk_w) * tex->surface.bpe;
-       }
+   if (sscreen->info.chip_class >= GFX9) {
+      *stride = tex->surface.u.gfx9.surf_pitch * tex->surface.bpe;
+      *layer_stride = tex->surface.u.gfx9.surf_slice_size;
+
+      if (!box)
+         return 0;
+
+      /* Each texture is an array of slices. Each slice is an array
+       * of mipmap levels. */
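+      /* box->x/y are in pixels; dividing by blk_w/blk_h converts them to block
+       * coordinates, and bpe is the number of bytes per element (per block for
+       * compressed formats). */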
+      return tex->surface.u.gfx9.surf_offset + box->z * tex->surface.u.gfx9.surf_slice_size +
+             tex->surface.u.gfx9.offset[level] +
+             (box->y / tex->surface.blk_h * tex->surface.u.gfx9.surf_pitch +
+              box->x / tex->surface.blk_w) *
+                tex->surface.bpe;
+   } else {
+      *stride = tex->surface.u.legacy.level[level].nblk_x * tex->surface.bpe;
+      assert((uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 <= UINT_MAX);
+      *layer_stride = (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4;
+
+      if (!box)
+         return tex->surface.u.legacy.level[level].offset;
+
+      /* Each texture is an array of mipmap levels. Each level is
+       * an array of slices. */
+      return tex->surface.u.legacy.level[level].offset +
+             box->z * (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 +
+             (box->y / tex->surface.blk_h * tex->surface.u.legacy.level[level].nblk_x +
+              box->x / tex->surface.blk_w) *
+                tex->surface.bpe;
+   }
 }
 
-static int si_init_surface(struct si_screen *sscreen,
-                          struct radeon_surf *surface,
-                          const struct pipe_resource *ptex,
-                          enum radeon_surf_mode array_mode,
-                          unsigned pitch_in_bytes_override,
-                          bool is_imported,
-                          bool is_scanout,
-                          bool is_flushed_depth,
-                          bool tc_compatible_htile)
+static int si_init_surface(struct si_screen *sscreen, struct radeon_surf *surface,
+                           const struct pipe_resource *ptex, enum radeon_surf_mode array_mode,
+                           unsigned pitch_in_bytes_override, bool is_imported, bool is_scanout,
+                           bool is_flushed_depth, bool tc_compatible_htile)
 {
-       const struct util_format_description *desc =
-               util_format_description(ptex->format);
-       bool is_depth, is_stencil;
-       int r;
-       unsigned bpe, flags = 0;
-
-       is_depth = util_format_has_depth(desc);
-       is_stencil = util_format_has_stencil(desc);
-
-       if (!is_flushed_depth &&
-           ptex->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
-               bpe = 4; /* stencil is allocated separately */
-       } else {
-               bpe = util_format_get_blocksize(ptex->format);
-               assert(util_is_power_of_two_or_zero(bpe));
-       }
-
-       if (!is_flushed_depth && is_depth) {
-               flags |= RADEON_SURF_ZBUFFER;
-
-               if (sscreen->debug_flags & DBG(NO_HYPERZ)) {
-                       flags |= RADEON_SURF_NO_HTILE;
-               } else if (tc_compatible_htile &&
-                          (sscreen->info.chip_class >= GFX9 ||
-                           array_mode == RADEON_SURF_MODE_2D)) {
-                       /* TC-compatible HTILE only supports Z32_FLOAT.
-                        * GFX9 also supports Z16_UNORM.
-                        * On GFX8, promote Z16 to Z32. DB->CB copies will convert
-                        * the format for transfers.
-                        */
-                       if (sscreen->info.chip_class == GFX8)
-                               bpe = 4;
-
-                       flags |= RADEON_SURF_TC_COMPATIBLE_HTILE;
-               }
-
-               if (is_stencil)
-                       flags |= RADEON_SURF_SBUFFER;
-       }
-
-       if (sscreen->info.chip_class >= GFX8 &&
-           (ptex->flags & SI_RESOURCE_FLAG_DISABLE_DCC ||
-            ptex->format == PIPE_FORMAT_R9G9B9E5_FLOAT ||
-            (ptex->nr_samples >= 2 && !sscreen->dcc_msaa_allowed)))
-               flags |= RADEON_SURF_DISABLE_DCC;
-
-       /* Stoney: 128bpp MSAA textures randomly fail piglit tests with DCC. */
-       if (sscreen->info.family == CHIP_STONEY &&
-           bpe == 16 && ptex->nr_samples >= 2)
-               flags |= RADEON_SURF_DISABLE_DCC;
-
-       /* GFX8: DCC clear for 4x and 8x MSAA array textures unimplemented. */
-       if (sscreen->info.chip_class == GFX8 &&
-           ptex->nr_storage_samples >= 4 &&
-           ptex->array_size > 1)
-               flags |= RADEON_SURF_DISABLE_DCC;
-
-       /* GFX9: DCC clear for 4x and 8x MSAA textures unimplemented. */
-       if (sscreen->info.chip_class == GFX9 &&
-           (ptex->nr_storage_samples >= 4 ||
-            (sscreen->info.family == CHIP_RAVEN &&
-             ptex->nr_storage_samples >= 2 && bpe < 4)))
-               flags |= RADEON_SURF_DISABLE_DCC;
-
-       /* TODO: GFX10: DCC causes corruption with MSAA. */
-       if (sscreen->info.chip_class >= GFX10 &&
-           ptex->nr_storage_samples >= 2)
-               flags |= RADEON_SURF_DISABLE_DCC;
-
-       /* Shared textures must always set up DCC.
-        * If it's not present, it will be disabled by
-        * si_get_opaque_metadata later.
-        */
-       if (!is_imported && (sscreen->debug_flags & DBG(NO_DCC)))
-               flags |= RADEON_SURF_DISABLE_DCC;
-
-       if (is_scanout) {
-               /* This should catch bugs in gallium users setting incorrect flags. */
-               assert(ptex->nr_samples <= 1 &&
-                      ptex->array_size == 1 &&
-                      ptex->depth0 == 1 &&
-                      ptex->last_level == 0 &&
-                      !(flags & RADEON_SURF_Z_OR_SBUFFER));
-
-               flags |= RADEON_SURF_SCANOUT;
-       }
-
-       if (ptex->bind & PIPE_BIND_SHARED)
-               flags |= RADEON_SURF_SHAREABLE;
-       if (is_imported)
-               flags |= RADEON_SURF_IMPORTED | RADEON_SURF_SHAREABLE;
-       if (!(ptex->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING))
-               flags |= RADEON_SURF_OPTIMIZE_FOR_SPACE;
-       if (sscreen->debug_flags & DBG(NO_FMASK))
-               flags |= RADEON_SURF_NO_FMASK;
-
-       if (sscreen->info.chip_class == GFX9 &&
-           (ptex->flags & SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE)) {
-               flags |= RADEON_SURF_FORCE_MICRO_TILE_MODE;
-               surface->micro_tile_mode = SI_RESOURCE_FLAG_MICRO_TILE_MODE_GET(ptex->flags);
-       }
-
-       if (sscreen->info.chip_class >= GFX10 &&
-           (ptex->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING)) {
-               flags |= RADEON_SURF_FORCE_SWIZZLE_MODE;
-               surface->u.gfx9.surf.swizzle_mode = ADDR_SW_64KB_R_X;
-       }
-
-       r = sscreen->ws->surface_init(sscreen->ws, ptex, flags, bpe,
-                                     array_mode, surface);
-       if (r) {
-               return r;
-       }
-
-       unsigned pitch = pitch_in_bytes_override / bpe;
-
-       if (sscreen->info.chip_class >= GFX9) {
-               if (pitch) {
-                       surface->u.gfx9.surf_pitch = pitch;
-                       if (ptex->last_level == 0)
-                               surface->u.gfx9.surf.epitch = pitch - 1;
-                       surface->u.gfx9.surf_slice_size =
-                               (uint64_t)pitch * surface->u.gfx9.surf_height * bpe;
-               }
-       } else {
-               if (pitch) {
-                       surface->u.legacy.level[0].nblk_x = pitch;
-                       surface->u.legacy.level[0].slice_size_dw =
-                               ((uint64_t)pitch * surface->u.legacy.level[0].nblk_y * bpe) / 4;
-               }
-       }
-       return 0;
+   const struct util_format_description *desc = util_format_description(ptex->format);
+   bool is_depth, is_stencil;
+   int r;
+   unsigned bpe, flags = 0;
+
+   is_depth = util_format_has_depth(desc);
+   is_stencil = util_format_has_stencil(desc);
+
+   if (!is_flushed_depth && ptex->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
+      bpe = 4; /* stencil is allocated separately */
+   } else {
+      bpe = util_format_get_blocksize(ptex->format);
+      assert(util_is_power_of_two_or_zero(bpe));
+   }
+
+   if (!is_flushed_depth && is_depth) {
+      flags |= RADEON_SURF_ZBUFFER;
+
+      if (sscreen->debug_flags & DBG(NO_HYPERZ)) {
+         flags |= RADEON_SURF_NO_HTILE;
+      } else if (tc_compatible_htile &&
+                 (sscreen->info.chip_class >= GFX9 || array_mode == RADEON_SURF_MODE_2D)) {
+         /* TC-compatible HTILE only supports Z32_FLOAT.
+          * GFX9 also supports Z16_UNORM.
+          * On GFX8, promote Z16 to Z32. DB->CB copies will convert
+          * the format for transfers.
+          */
+         if (sscreen->info.chip_class == GFX8)
+            bpe = 4;
+
+         flags |= RADEON_SURF_TC_COMPATIBLE_HTILE;
+      }
+
+      if (is_stencil)
+         flags |= RADEON_SURF_SBUFFER;
+   }
+
+   if (sscreen->info.chip_class >= GFX8 &&
+       (ptex->flags & SI_RESOURCE_FLAG_DISABLE_DCC || ptex->format == PIPE_FORMAT_R9G9B9E5_FLOAT ||
+        (ptex->nr_samples >= 2 && !sscreen->dcc_msaa_allowed)))
+      flags |= RADEON_SURF_DISABLE_DCC;
+
+   /* Stoney: 128bpp MSAA textures randomly fail piglit tests with DCC. */
+   if (sscreen->info.family == CHIP_STONEY && bpe == 16 && ptex->nr_samples >= 2)
+      flags |= RADEON_SURF_DISABLE_DCC;
+
+   /* GFX8: DCC clear for 4x and 8x MSAA array textures unimplemented. */
+   if (sscreen->info.chip_class == GFX8 && ptex->nr_storage_samples >= 4 && ptex->array_size > 1)
+      flags |= RADEON_SURF_DISABLE_DCC;
+
+   /* GFX9: DCC clear for 4x and 8x MSAA textures unimplemented. */
+   if (sscreen->info.chip_class == GFX9 &&
+       (ptex->nr_storage_samples >= 4 ||
+        (sscreen->info.family == CHIP_RAVEN && ptex->nr_storage_samples >= 2 && bpe < 4)))
+      flags |= RADEON_SURF_DISABLE_DCC;
+
+   /* TODO: GFX10: DCC causes corruption with MSAA. */
+   if (sscreen->info.chip_class >= GFX10 && ptex->nr_storage_samples >= 2)
+      flags |= RADEON_SURF_DISABLE_DCC;
+
+   /* Shared textures must always set up DCC.
+    * If it's not present, it will be disabled by
+    * si_get_opaque_metadata later.
+    */
+   if (!is_imported && (sscreen->debug_flags & DBG(NO_DCC)))
+      flags |= RADEON_SURF_DISABLE_DCC;
+
+   if (is_scanout) {
+      /* This should catch bugs in gallium users setting incorrect flags. */
+      assert(ptex->nr_samples <= 1 && ptex->array_size == 1 && ptex->depth0 == 1 &&
+             ptex->last_level == 0 && !(flags & RADEON_SURF_Z_OR_SBUFFER));
+
+      flags |= RADEON_SURF_SCANOUT;
+   }
+
+   if (ptex->bind & PIPE_BIND_SHARED)
+      flags |= RADEON_SURF_SHAREABLE;
+   if (is_imported)
+      flags |= RADEON_SURF_IMPORTED | RADEON_SURF_SHAREABLE;
+   if (!(ptex->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING))
+      flags |= RADEON_SURF_OPTIMIZE_FOR_SPACE;
+   if (sscreen->debug_flags & DBG(NO_FMASK))
+      flags |= RADEON_SURF_NO_FMASK;
+
+   if (sscreen->info.chip_class == GFX9 && (ptex->flags & SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE)) {
+      flags |= RADEON_SURF_FORCE_MICRO_TILE_MODE;
+      surface->micro_tile_mode = SI_RESOURCE_FLAG_MICRO_TILE_MODE_GET(ptex->flags);
+   }
+
+   if (sscreen->info.chip_class >= GFX10 && (ptex->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING)) {
+      flags |= RADEON_SURF_FORCE_SWIZZLE_MODE;
+      surface->u.gfx9.surf.swizzle_mode = ADDR_SW_64KB_R_X;
+   }
+
+   r = sscreen->ws->surface_init(sscreen->ws, ptex, flags, bpe, array_mode, surface);
+   if (r) {
+      return r;
+   }
+
+   unsigned pitch = pitch_in_bytes_override / bpe;
+
+   if (sscreen->info.chip_class >= GFX9) {
+      if (pitch) {
+         surface->u.gfx9.surf_pitch = pitch;
+         if (ptex->last_level == 0)
+            surface->u.gfx9.surf.epitch = pitch - 1;
+         surface->u.gfx9.surf_slice_size = (uint64_t)pitch * surface->u.gfx9.surf_height * bpe;
+      }
+   } else {
+      if (pitch) {
+         surface->u.legacy.level[0].nblk_x = pitch;
+         surface->u.legacy.level[0].slice_size_dw =
+            ((uint64_t)pitch * surface->u.legacy.level[0].nblk_y * bpe) / 4;
+      }
+   }
+   return 0;
 }
 
-static void si_get_display_metadata(struct si_screen *sscreen,
-                                   struct radeon_surf *surf,
-                                   struct radeon_bo_metadata *metadata,
-                                   enum radeon_surf_mode *array_mode,
-                                   bool *is_scanout)
+static void si_get_display_metadata(struct si_screen *sscreen, struct radeon_surf *surf,
+                                    struct radeon_bo_metadata *metadata,
+                                    enum radeon_surf_mode *array_mode, bool *is_scanout)
 {
-       if (sscreen->info.chip_class >= GFX9) {
-               if (metadata->u.gfx9.swizzle_mode > 0)
-                       *array_mode = RADEON_SURF_MODE_2D;
-               else
-                       *array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
-
-               surf->u.gfx9.surf.swizzle_mode = metadata->u.gfx9.swizzle_mode;
-               *is_scanout = metadata->u.gfx9.scanout;
-
-               if (metadata->u.gfx9.dcc_offset_256B) {
-                       surf->u.gfx9.display_dcc_pitch_max = metadata->u.gfx9.dcc_pitch_max;
-                       assert(metadata->u.gfx9.dcc_independent_64B == 1);
-               }
-       } else {
-               surf->u.legacy.pipe_config = metadata->u.legacy.pipe_config;
-               surf->u.legacy.bankw = metadata->u.legacy.bankw;
-               surf->u.legacy.bankh = metadata->u.legacy.bankh;
-               surf->u.legacy.tile_split = metadata->u.legacy.tile_split;
-               surf->u.legacy.mtilea = metadata->u.legacy.mtilea;
-               surf->u.legacy.num_banks = metadata->u.legacy.num_banks;
-
-               if (metadata->u.legacy.macrotile == RADEON_LAYOUT_TILED)
-                       *array_mode = RADEON_SURF_MODE_2D;
-               else if (metadata->u.legacy.microtile == RADEON_LAYOUT_TILED)
-                       *array_mode = RADEON_SURF_MODE_1D;
-               else
-                       *array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
-
-               *is_scanout = metadata->u.legacy.scanout;
-       }
+   if (sscreen->info.chip_class >= GFX9) {
+      if (metadata->u.gfx9.swizzle_mode > 0)
+         *array_mode = RADEON_SURF_MODE_2D;
+      else
+         *array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+      surf->u.gfx9.surf.swizzle_mode = metadata->u.gfx9.swizzle_mode;
+      *is_scanout = metadata->u.gfx9.scanout;
+
+      if (metadata->u.gfx9.dcc_offset_256B) {
+         surf->u.gfx9.display_dcc_pitch_max = metadata->u.gfx9.dcc_pitch_max;
+         assert(metadata->u.gfx9.dcc_independent_64B == 1);
+      }
+   } else {
+      surf->u.legacy.pipe_config = metadata->u.legacy.pipe_config;
+      surf->u.legacy.bankw = metadata->u.legacy.bankw;
+      surf->u.legacy.bankh = metadata->u.legacy.bankh;
+      surf->u.legacy.tile_split = metadata->u.legacy.tile_split;
+      surf->u.legacy.mtilea = metadata->u.legacy.mtilea;
+      surf->u.legacy.num_banks = metadata->u.legacy.num_banks;
+
+      if (metadata->u.legacy.macrotile == RADEON_LAYOUT_TILED)
+         *array_mode = RADEON_SURF_MODE_2D;
+      else if (metadata->u.legacy.microtile == RADEON_LAYOUT_TILED)
+         *array_mode = RADEON_SURF_MODE_1D;
+      else
+         *array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+      *is_scanout = metadata->u.legacy.scanout;
+   }
 }
 
-void si_eliminate_fast_color_clear(struct si_context *sctx,
-                                  struct si_texture *tex)
+void si_eliminate_fast_color_clear(struct si_context *sctx, struct si_texture *tex)
 {
-       struct si_screen *sscreen = sctx->screen;
-       struct pipe_context *ctx = &sctx->b;
+   struct si_screen *sscreen = sctx->screen;
+   struct pipe_context *ctx = &sctx->b;
 
-       if (ctx == sscreen->aux_context)
-               simple_mtx_lock(&sscreen->aux_context_lock);
+   if (ctx == sscreen->aux_context)
+      simple_mtx_lock(&sscreen->aux_context_lock);
 
-       unsigned n = sctx->num_decompress_calls;
-       ctx->flush_resource(ctx, &tex->buffer.b.b);
+   unsigned n = sctx->num_decompress_calls;
+   ctx->flush_resource(ctx, &tex->buffer.b.b);
 
-       /* Flush only if any fast clear elimination took place. */
-       if (n != sctx->num_decompress_calls)
-               ctx->flush(ctx, NULL, 0);
+   /* Flush only if any fast clear elimination took place. */
+   if (n != sctx->num_decompress_calls)
+      ctx->flush(ctx, NULL, 0);
 
-       if (ctx == sscreen->aux_context)
-               simple_mtx_unlock(&sscreen->aux_context_lock);
+   if (ctx == sscreen->aux_context)
+      simple_mtx_unlock(&sscreen->aux_context_lock);
 }
 
-void si_texture_discard_cmask(struct si_screen *sscreen,
-                             struct si_texture *tex)
+void si_texture_discard_cmask(struct si_screen *sscreen, struct si_texture *tex)
 {
-       if (!tex->cmask_buffer)
-               return;
+   if (!tex->cmask_buffer)
+      return;
 
-       assert(tex->buffer.b.b.nr_samples <= 1);
+   assert(tex->buffer.b.b.nr_samples <= 1);
 
-       /* Disable CMASK. */
-       tex->cmask_base_address_reg = tex->buffer.gpu_address >> 8;
-       tex->dirty_level_mask = 0;
+   /* Disable CMASK. */
+   tex->cmask_base_address_reg = tex->buffer.gpu_address >> 8;
+   tex->dirty_level_mask = 0;
 
-       tex->cb_color_info &= ~S_028C70_FAST_CLEAR(1);
+   tex->cb_color_info &= ~S_028C70_FAST_CLEAR(1);
 
-       if (tex->cmask_buffer != &tex->buffer)
-           si_resource_reference(&tex->cmask_buffer, NULL);
+   if (tex->cmask_buffer != &tex->buffer)
+      si_resource_reference(&tex->cmask_buffer, NULL);
 
-       tex->cmask_buffer = NULL;
+   tex->cmask_buffer = NULL;
 
-       /* Notify all contexts about the change. */
-       p_atomic_inc(&sscreen->dirty_tex_counter);
-       p_atomic_inc(&sscreen->compressed_colortex_counter);
+   /* Notify all contexts about the change. */
+   p_atomic_inc(&sscreen->dirty_tex_counter);
+   p_atomic_inc(&sscreen->compressed_colortex_counter);
 }
 
 static bool si_can_disable_dcc(struct si_texture *tex)
 {
-       /* We can't disable DCC if it can be written by another process. */
-       return tex->surface.dcc_offset &&
-              (!tex->buffer.b.is_shared ||
-               !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE));
+   /* We can't disable DCC if it can be written by another process. */
+   return tex->surface.dcc_offset &&
+          (!tex->buffer.b.is_shared ||
+           !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE));
 }
 
 static void si_texture_zero_dcc_fields(struct si_texture *tex)
 {
-       tex->surface.dcc_offset = 0;
-       tex->surface.display_dcc_offset = 0;
-       tex->surface.dcc_retile_map_offset = 0;
+   tex->surface.dcc_offset = 0;
+   tex->surface.display_dcc_offset = 0;
+   tex->surface.dcc_retile_map_offset = 0;
 }
 
-static bool si_texture_discard_dcc(struct si_screen *sscreen,
-                                  struct si_texture *tex)
+static bool si_texture_discard_dcc(struct si_screen *sscreen, struct si_texture *tex)
 {
-       if (!si_can_disable_dcc(tex))
-               return false;
+   if (!si_can_disable_dcc(tex))
+      return false;
 
-       assert(tex->dcc_separate_buffer == NULL);
+   assert(tex->dcc_separate_buffer == NULL);
 
-       /* Disable DCC. */
-       si_texture_zero_dcc_fields(tex);
+   /* Disable DCC. */
+   si_texture_zero_dcc_fields(tex);
 
-       /* Notify all contexts about the change. */
-       p_atomic_inc(&sscreen->dirty_tex_counter);
-       return true;
+   /* Notify all contexts about the change. */
+   p_atomic_inc(&sscreen->dirty_tex_counter);
+   return true;
 }
 
 /**
@@ -502,783 +459,726 @@ static bool si_texture_discard_dcc(struct si_screen *sscreen,
  * \param sctx  the current context if you have one, or sscreen->aux_context
  *              if you don't.
  */
-bool si_texture_disable_dcc(struct si_context *sctx,
-                           struct si_texture *tex)
+bool si_texture_disable_dcc(struct si_context *sctx, struct si_texture *tex)
 {
-       struct si_screen *sscreen = sctx->screen;
+   struct si_screen *sscreen = sctx->screen;
 
-       if (!sctx->has_graphics)
-               return si_texture_discard_dcc(sscreen, tex);
+   if (!sctx->has_graphics)
+      return si_texture_discard_dcc(sscreen, tex);
 
-       if (!si_can_disable_dcc(tex))
-               return false;
+   if (!si_can_disable_dcc(tex))
+      return false;
 
-       if (&sctx->b == sscreen->aux_context)
-               simple_mtx_lock(&sscreen->aux_context_lock);
+   if (&sctx->b == sscreen->aux_context)
+      simple_mtx_lock(&sscreen->aux_context_lock);
 
-       /* Decompress DCC. */
-       si_decompress_dcc(sctx, tex);
-       sctx->b.flush(&sctx->b, NULL, 0);
+   /* Decompress DCC. */
+   si_decompress_dcc(sctx, tex);
+   sctx->b.flush(&sctx->b, NULL, 0);
 
-       if (&sctx->b == sscreen->aux_context)
-               simple_mtx_unlock(&sscreen->aux_context_lock);
+   if (&sctx->b == sscreen->aux_context)
+      simple_mtx_unlock(&sscreen->aux_context_lock);
 
-       return si_texture_discard_dcc(sscreen, tex);
+   return si_texture_discard_dcc(sscreen, tex);
 }
 
-static void si_reallocate_texture_inplace(struct si_context *sctx,
-                                         struct si_texture *tex,
-                                         unsigned new_bind_flag,
-                                         bool invalidate_storage)
+static void si_reallocate_texture_inplace(struct si_context *sctx, struct si_texture *tex,
+                                          unsigned new_bind_flag, bool invalidate_storage)
 {
-       struct pipe_screen *screen = sctx->b.screen;
-       struct si_texture *new_tex;
-       struct pipe_resource templ = tex->buffer.b.b;
-       unsigned i;
-
-       templ.bind |= new_bind_flag;
-
-       if (tex->buffer.b.is_shared || tex->num_planes > 1)
-               return;
-
-       if (new_bind_flag == PIPE_BIND_LINEAR) {
-               if (tex->surface.is_linear)
-                       return;
-
-               /* This fails with MSAA, depth, and compressed textures. */
-               if (si_choose_tiling(sctx->screen, &templ, false) !=
-                   RADEON_SURF_MODE_LINEAR_ALIGNED)
-                       return;
-       }
-
-       new_tex = (struct si_texture*)screen->resource_create(screen, &templ);
-       if (!new_tex)
-               return;
-
-       /* Copy the pixels to the new texture. */
-       if (!invalidate_storage) {
-               for (i = 0; i <= templ.last_level; i++) {
-                       struct pipe_box box;
-
-                       u_box_3d(0, 0, 0,
-                                u_minify(templ.width0, i), u_minify(templ.height0, i),
-                                util_num_layers(&templ, i), &box);
-
-                       sctx->dma_copy(&sctx->b, &new_tex->buffer.b.b, i, 0, 0, 0,
-                                      &tex->buffer.b.b, i, &box);
-               }
-       }
-
-       if (new_bind_flag == PIPE_BIND_LINEAR) {
-               si_texture_discard_cmask(sctx->screen, tex);
-               si_texture_discard_dcc(sctx->screen, tex);
-       }
-
-       /* Replace the structure fields of tex. */
-       tex->buffer.b.b.bind = templ.bind;
-       pb_reference(&tex->buffer.buf, new_tex->buffer.buf);
-       tex->buffer.gpu_address = new_tex->buffer.gpu_address;
-       tex->buffer.vram_usage = new_tex->buffer.vram_usage;
-       tex->buffer.gart_usage = new_tex->buffer.gart_usage;
-       tex->buffer.bo_size = new_tex->buffer.bo_size;
-       tex->buffer.bo_alignment = new_tex->buffer.bo_alignment;
-       tex->buffer.domains = new_tex->buffer.domains;
-       tex->buffer.flags = new_tex->buffer.flags;
-
-       tex->surface = new_tex->surface;
-       si_texture_reference(&tex->flushed_depth_texture,
-                            new_tex->flushed_depth_texture);
-
-       tex->surface.fmask_offset = new_tex->surface.fmask_offset;
-       tex->surface.cmask_offset = new_tex->surface.cmask_offset;
-       tex->cmask_base_address_reg = new_tex->cmask_base_address_reg;
-
-       if (tex->cmask_buffer == &tex->buffer)
-               tex->cmask_buffer = NULL;
-       else
-               si_resource_reference(&tex->cmask_buffer, NULL);
-
-       if (new_tex->cmask_buffer == &new_tex->buffer)
-               tex->cmask_buffer = &tex->buffer;
-       else
-               si_resource_reference(&tex->cmask_buffer, new_tex->cmask_buffer);
-
-       tex->surface.dcc_offset = new_tex->surface.dcc_offset;
-       tex->cb_color_info = new_tex->cb_color_info;
-       memcpy(tex->color_clear_value, new_tex->color_clear_value,
-              sizeof(tex->color_clear_value));
-       tex->last_msaa_resolve_target_micro_mode = new_tex->last_msaa_resolve_target_micro_mode;
-
-       tex->surface.htile_offset = new_tex->surface.htile_offset;
-       tex->depth_clear_value = new_tex->depth_clear_value;
-       tex->dirty_level_mask = new_tex->dirty_level_mask;
-       tex->stencil_dirty_level_mask = new_tex->stencil_dirty_level_mask;
-       tex->db_render_format = new_tex->db_render_format;
-       tex->stencil_clear_value = new_tex->stencil_clear_value;
-       tex->tc_compatible_htile = new_tex->tc_compatible_htile;
-       tex->depth_cleared = new_tex->depth_cleared;
-       tex->stencil_cleared = new_tex->stencil_cleared;
-       tex->upgraded_depth = new_tex->upgraded_depth;
-       tex->db_compatible = new_tex->db_compatible;
-       tex->can_sample_z = new_tex->can_sample_z;
-       tex->can_sample_s = new_tex->can_sample_s;
-
-       tex->separate_dcc_dirty = new_tex->separate_dcc_dirty;
-       tex->displayable_dcc_dirty = new_tex->displayable_dcc_dirty;
-       tex->dcc_gather_statistics = new_tex->dcc_gather_statistics;
-       si_resource_reference(&tex->dcc_separate_buffer,
-                               new_tex->dcc_separate_buffer);
-       si_resource_reference(&tex->last_dcc_separate_buffer,
-                               new_tex->last_dcc_separate_buffer);
-
-       if (new_bind_flag == PIPE_BIND_LINEAR) {
-               assert(!tex->surface.htile_offset);
-               assert(!tex->cmask_buffer);
-               assert(!tex->surface.fmask_size);
-               assert(!tex->surface.dcc_offset);
-               assert(!tex->is_depth);
-       }
-
-       si_texture_reference(&new_tex, NULL);
-
-       p_atomic_inc(&sctx->screen->dirty_tex_counter);
+   struct pipe_screen *screen = sctx->b.screen;
+   struct si_texture *new_tex;
+   struct pipe_resource templ = tex->buffer.b.b;
+   unsigned i;
+
+   templ.bind |= new_bind_flag;
+
+   if (tex->buffer.b.is_shared || tex->num_planes > 1)
+      return;
+
+   if (new_bind_flag == PIPE_BIND_LINEAR) {
+      if (tex->surface.is_linear)
+         return;
+
+      /* This fails with MSAA, depth, and compressed textures. */
+      if (si_choose_tiling(sctx->screen, &templ, false) != RADEON_SURF_MODE_LINEAR_ALIGNED)
+         return;
+   }
+
+   new_tex = (struct si_texture *)screen->resource_create(screen, &templ);
+   if (!new_tex)
+      return;
+
+   /* Copy the pixels to the new texture. */
+   if (!invalidate_storage) {
+      for (i = 0; i <= templ.last_level; i++) {
+         struct pipe_box box;
+
+         u_box_3d(0, 0, 0, u_minify(templ.width0, i), u_minify(templ.height0, i),
+                  util_num_layers(&templ, i), &box);
+
+         sctx->dma_copy(&sctx->b, &new_tex->buffer.b.b, i, 0, 0, 0, &tex->buffer.b.b, i, &box);
+      }
+   }
+
+   if (new_bind_flag == PIPE_BIND_LINEAR) {
+      si_texture_discard_cmask(sctx->screen, tex);
+      si_texture_discard_dcc(sctx->screen, tex);
+   }
+
+   /* Replace the structure fields of tex. */
+   tex->buffer.b.b.bind = templ.bind;
+   pb_reference(&tex->buffer.buf, new_tex->buffer.buf);
+   tex->buffer.gpu_address = new_tex->buffer.gpu_address;
+   tex->buffer.vram_usage = new_tex->buffer.vram_usage;
+   tex->buffer.gart_usage = new_tex->buffer.gart_usage;
+   tex->buffer.bo_size = new_tex->buffer.bo_size;
+   tex->buffer.bo_alignment = new_tex->buffer.bo_alignment;
+   tex->buffer.domains = new_tex->buffer.domains;
+   tex->buffer.flags = new_tex->buffer.flags;
+
+   tex->surface = new_tex->surface;
+   si_texture_reference(&tex->flushed_depth_texture, new_tex->flushed_depth_texture);
+
+   tex->surface.fmask_offset = new_tex->surface.fmask_offset;
+   tex->surface.cmask_offset = new_tex->surface.cmask_offset;
+   tex->cmask_base_address_reg = new_tex->cmask_base_address_reg;
+
+   if (tex->cmask_buffer == &tex->buffer)
+      tex->cmask_buffer = NULL;
+   else
+      si_resource_reference(&tex->cmask_buffer, NULL);
+
+   if (new_tex->cmask_buffer == &new_tex->buffer)
+      tex->cmask_buffer = &tex->buffer;
+   else
+      si_resource_reference(&tex->cmask_buffer, new_tex->cmask_buffer);
+
+   tex->surface.dcc_offset = new_tex->surface.dcc_offset;
+   tex->cb_color_info = new_tex->cb_color_info;
+   memcpy(tex->color_clear_value, new_tex->color_clear_value, sizeof(tex->color_clear_value));
+   tex->last_msaa_resolve_target_micro_mode = new_tex->last_msaa_resolve_target_micro_mode;
+
+   tex->surface.htile_offset = new_tex->surface.htile_offset;
+   tex->depth_clear_value = new_tex->depth_clear_value;
+   tex->dirty_level_mask = new_tex->dirty_level_mask;
+   tex->stencil_dirty_level_mask = new_tex->stencil_dirty_level_mask;
+   tex->db_render_format = new_tex->db_render_format;
+   tex->stencil_clear_value = new_tex->stencil_clear_value;
+   tex->tc_compatible_htile = new_tex->tc_compatible_htile;
+   tex->depth_cleared = new_tex->depth_cleared;
+   tex->stencil_cleared = new_tex->stencil_cleared;
+   tex->upgraded_depth = new_tex->upgraded_depth;
+   tex->db_compatible = new_tex->db_compatible;
+   tex->can_sample_z = new_tex->can_sample_z;
+   tex->can_sample_s = new_tex->can_sample_s;
+
+   tex->separate_dcc_dirty = new_tex->separate_dcc_dirty;
+   tex->displayable_dcc_dirty = new_tex->displayable_dcc_dirty;
+   tex->dcc_gather_statistics = new_tex->dcc_gather_statistics;
+   si_resource_reference(&tex->dcc_separate_buffer, new_tex->dcc_separate_buffer);
+   si_resource_reference(&tex->last_dcc_separate_buffer, new_tex->last_dcc_separate_buffer);
+
+   if (new_bind_flag == PIPE_BIND_LINEAR) {
+      assert(!tex->surface.htile_offset);
+      assert(!tex->cmask_buffer);
+      assert(!tex->surface.fmask_size);
+      assert(!tex->surface.dcc_offset);
+      assert(!tex->is_depth);
+   }
+
+   si_texture_reference(&new_tex, NULL);
+
+   p_atomic_inc(&sctx->screen->dirty_tex_counter);
 }
 
 static uint32_t si_get_bo_metadata_word1(struct si_screen *sscreen)
 {
-       return (ATI_VENDOR_ID << 16) | sscreen->info.pci_id;
+   return (ATI_VENDOR_ID << 16) | sscreen->info.pci_id;
 }
 
-static void si_set_tex_bo_metadata(struct si_screen *sscreen,
-                                  struct si_texture *tex)
+static void si_set_tex_bo_metadata(struct si_screen *sscreen, struct si_texture *tex)
 {
-       struct radeon_surf *surface = &tex->surface;
-       struct pipe_resource *res = &tex->buffer.b.b;
-       struct radeon_bo_metadata md;
-
-       memset(&md, 0, sizeof(md));
-
-       if (sscreen->info.chip_class >= GFX9) {
-               md.u.gfx9.swizzle_mode = surface->u.gfx9.surf.swizzle_mode;
-               md.u.gfx9.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0;
-
-               if (tex->surface.dcc_offset && !tex->dcc_separate_buffer) {
-                       uint64_t dcc_offset =
-                               tex->surface.display_dcc_offset ? tex->surface.display_dcc_offset
-                                                       : tex->surface.dcc_offset;
-
-                       assert((dcc_offset >> 8) != 0 && (dcc_offset >> 8) < (1 << 24));
-                       md.u.gfx9.dcc_offset_256B = dcc_offset >> 8;
-                       md.u.gfx9.dcc_pitch_max = tex->surface.u.gfx9.display_dcc_pitch_max;
-                       md.u.gfx9.dcc_independent_64B = 1;
-               }
-       } else {
-               md.u.legacy.microtile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_1D ?
-                                          RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR;
-               md.u.legacy.macrotile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_2D ?
-                                          RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR;
-               md.u.legacy.pipe_config = surface->u.legacy.pipe_config;
-               md.u.legacy.bankw = surface->u.legacy.bankw;
-               md.u.legacy.bankh = surface->u.legacy.bankh;
-               md.u.legacy.tile_split = surface->u.legacy.tile_split;
-               md.u.legacy.mtilea = surface->u.legacy.mtilea;
-               md.u.legacy.num_banks = surface->u.legacy.num_banks;
-               md.u.legacy.stride = surface->u.legacy.level[0].nblk_x * surface->bpe;
-               md.u.legacy.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0;
-       }
-
-       assert(tex->dcc_separate_buffer == NULL);
-       assert(tex->surface.fmask_size == 0);
-
-       /* Metadata image format version 1:
-        * [0] = 1 (metadata format identifier)
-        * [1] = (VENDOR_ID << 16) | PCI_ID
-        * [2:9] = image descriptor for the whole resource
-        *         [2] is always 0, because the base address is cleared
-        *         [9] is the DCC offset bits [39:8] from the beginning of
-        *             the buffer
-        * [10:10+LAST_LEVEL] = mipmap level offset bits [39:8] for each level
-        */
-
-       md.metadata[0] = 1; /* metadata image format version 1 */
-
-       /* TILE_MODE_INDEX is ambiguous without a PCI ID. */
-       md.metadata[1] = si_get_bo_metadata_word1(sscreen);
-
-       static const unsigned char swizzle[] = {
-               PIPE_SWIZZLE_X,
-               PIPE_SWIZZLE_Y,
-               PIPE_SWIZZLE_Z,
-               PIPE_SWIZZLE_W
-       };
-       bool is_array = util_texture_is_array(res->target);
-       uint32_t desc[8];
-
-       sscreen->make_texture_descriptor(sscreen, tex, true,
-                                  res->target, res->format,
-                                  swizzle, 0, res->last_level, 0,
-                                  is_array ? res->array_size - 1 : 0,
-                                  res->width0, res->height0, res->depth0,
-                                  desc, NULL);
-
-       si_set_mutable_tex_desc_fields(sscreen, tex, &tex->surface.u.legacy.level[0],
-                                      0, 0, tex->surface.blk_w, false, desc);
-
-       /* Clear the base address and set the relative DCC offset. */
-       desc[0] = 0;
-       desc[1] &= C_008F14_BASE_ADDRESS_HI;
-
-       switch (sscreen->info.chip_class) {
-       case GFX6:
-       case GFX7:
-               break;
-       case GFX8:
-               desc[7] = tex->surface.dcc_offset >> 8;
-               break;
-       case GFX9:
-               desc[7] = tex->surface.dcc_offset >> 8;
-               desc[5] &= C_008F24_META_DATA_ADDRESS;
-               desc[5] |= S_008F24_META_DATA_ADDRESS(tex->surface.dcc_offset >> 40);
-               break;
-       case GFX10:
-               desc[6] &= C_00A018_META_DATA_ADDRESS_LO;
-               desc[6] |= S_00A018_META_DATA_ADDRESS_LO(tex->surface.dcc_offset >> 8);
-               desc[7] = tex->surface.dcc_offset >> 16;
-               break;
-       default:
-               assert(0);
-       }
-
-
-       /* Dwords [2:9] contain the image descriptor. */
-       memcpy(&md.metadata[2], desc, sizeof(desc));
-       md.size_metadata = 10 * 4;
-
-       /* Dwords [10:..] contain the mipmap level offsets. */
-       if (sscreen->info.chip_class <= GFX8) {
-               for (unsigned i = 0; i <= res->last_level; i++)
-                       md.metadata[10+i] = tex->surface.u.legacy.level[i].offset >> 8;
-
-               md.size_metadata += (1 + res->last_level) * 4;
-       }
-
-       sscreen->ws->buffer_set_metadata(tex->buffer.buf, &md);
+   struct radeon_surf *surface = &tex->surface;
+   struct pipe_resource *res = &tex->buffer.b.b;
+   struct radeon_bo_metadata md;
+
+   memset(&md, 0, sizeof(md));
+
+   if (sscreen->info.chip_class >= GFX9) {
+      md.u.gfx9.swizzle_mode = surface->u.gfx9.surf.swizzle_mode;
+      md.u.gfx9.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0;
+
+      if (tex->surface.dcc_offset && !tex->dcc_separate_buffer) {
+         uint64_t dcc_offset = tex->surface.display_dcc_offset ? tex->surface.display_dcc_offset
+                                                               : tex->surface.dcc_offset;
+
+         assert((dcc_offset >> 8) != 0 && (dcc_offset >> 8) < (1 << 24));
+         md.u.gfx9.dcc_offset_256B = dcc_offset >> 8;
+         md.u.gfx9.dcc_pitch_max = tex->surface.u.gfx9.display_dcc_pitch_max;
+         md.u.gfx9.dcc_independent_64B = 1;
+      }
+   } else {
+      md.u.legacy.microtile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_1D
+                                 ? RADEON_LAYOUT_TILED
+                                 : RADEON_LAYOUT_LINEAR;
+      md.u.legacy.macrotile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_2D
+                                 ? RADEON_LAYOUT_TILED
+                                 : RADEON_LAYOUT_LINEAR;
+      md.u.legacy.pipe_config = surface->u.legacy.pipe_config;
+      md.u.legacy.bankw = surface->u.legacy.bankw;
+      md.u.legacy.bankh = surface->u.legacy.bankh;
+      md.u.legacy.tile_split = surface->u.legacy.tile_split;
+      md.u.legacy.mtilea = surface->u.legacy.mtilea;
+      md.u.legacy.num_banks = surface->u.legacy.num_banks;
+      md.u.legacy.stride = surface->u.legacy.level[0].nblk_x * surface->bpe;
+      md.u.legacy.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0;
+   }
+
+   assert(tex->dcc_separate_buffer == NULL);
+   assert(tex->surface.fmask_size == 0);
+
+   /* Metadata image format version 1:
+    * [0] = 1 (metadata format identifier)
+    * [1] = (VENDOR_ID << 16) | PCI_ID
+    * [2:9] = image descriptor for the whole resource
+    *         [2] is always 0, because the base address is cleared
+    *         [9] is the DCC offset bits [39:8] from the beginning of
+    *             the buffer
+    * [10:10+LAST_LEVEL] = mipmap level offset bits [39:8] for each level
+    */
+
+   md.metadata[0] = 1; /* metadata image format version 1 */
+
+   /* TILE_MODE_INDEX is ambiguous without a PCI ID. */
+   md.metadata[1] = si_get_bo_metadata_word1(sscreen);
+
+   static const unsigned char swizzle[] = {PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z,
+                                           PIPE_SWIZZLE_W};
+   bool is_array = util_texture_is_array(res->target);
+   uint32_t desc[8];
+
+   sscreen->make_texture_descriptor(sscreen, tex, true, res->target, res->format, swizzle, 0,
+                                    res->last_level, 0, is_array ? res->array_size - 1 : 0,
+                                    res->width0, res->height0, res->depth0, desc, NULL);
+
+   si_set_mutable_tex_desc_fields(sscreen, tex, &tex->surface.u.legacy.level[0], 0, 0,
+                                  tex->surface.blk_w, false, desc);
+
+   /* Clear the base address and set the relative DCC offset. */
+   desc[0] = 0;
+   desc[1] &= C_008F14_BASE_ADDRESS_HI;
+
+   switch (sscreen->info.chip_class) {
+   case GFX6:
+   case GFX7:
+      break;
+   case GFX8:
+      desc[7] = tex->surface.dcc_offset >> 8;
+      break;
+   case GFX9:
+      desc[7] = tex->surface.dcc_offset >> 8;
+      desc[5] &= C_008F24_META_DATA_ADDRESS;
+      desc[5] |= S_008F24_META_DATA_ADDRESS(tex->surface.dcc_offset >> 40);
+      break;
+   case GFX10:
+      desc[6] &= C_00A018_META_DATA_ADDRESS_LO;
+      desc[6] |= S_00A018_META_DATA_ADDRESS_LO(tex->surface.dcc_offset >> 8);
+      desc[7] = tex->surface.dcc_offset >> 16;
+      break;
+   default:
+      assert(0);
+   }
+
+   /* Dwords [2:9] contain the image descriptor. */
+   memcpy(&md.metadata[2], desc, sizeof(desc));
+   md.size_metadata = 10 * 4;
+
+   /* Dwords [10:..] contain the mipmap level offsets. */
+   if (sscreen->info.chip_class <= GFX8) {
+      for (unsigned i = 0; i <= res->last_level; i++)
+         md.metadata[10 + i] = tex->surface.u.legacy.level[i].offset >> 8;
+
+      md.size_metadata += (1 + res->last_level) * 4;
+   }
+
+   sscreen->ws->buffer_set_metadata(tex->buffer.buf, &md);
 }
 
-static bool si_read_tex_bo_metadata(struct si_screen *sscreen,
-                                   struct si_texture *tex,
-                                   uint64_t offset,
-                                   struct radeon_bo_metadata *md)
+static bool si_read_tex_bo_metadata(struct si_screen *sscreen, struct si_texture *tex,
+                                    uint64_t offset, struct radeon_bo_metadata *md)
 {
-       uint32_t *desc = &md->metadata[2];
-
-       if (offset || /* Non-zero planes ignore metadata. */
-           md->size_metadata < 10 * 4 || /* at least 2(header) + 8(desc) dwords */
-           md->metadata[0] == 0 || /* invalid version number */
-           md->metadata[1] != si_get_bo_metadata_word1(sscreen)) /* invalid PCI ID */ {
-               /* Disable DCC because it might not be enabled. */
-               si_texture_zero_dcc_fields(tex);
-
-               /* Don't report an error if the texture comes from an incompatible driver,
-                * but this might not work.
-                */
-               return true;
-       }
-
-       /* Validate that sample counts and the number of mipmap levels match. */
-       unsigned last_level = G_008F1C_LAST_LEVEL(desc[3]);
-       unsigned type = G_008F1C_TYPE(desc[3]);
-
-       if (type == V_008F1C_SQ_RSRC_IMG_2D_MSAA ||
-           type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) {
-               unsigned log_samples =
-                       util_logbase2(MAX2(1, tex->buffer.b.b.nr_storage_samples));
-
-               if (last_level != log_samples) {
-                       fprintf(stderr, "radeonsi: invalid MSAA texture import, "
-                                       "metadata has log2(samples) = %u, the caller set %u\n",
-                               last_level, log_samples);
-                       return false;
-               }
-       } else {
-               if (last_level != tex->buffer.b.b.last_level) {
-                       fprintf(stderr, "radeonsi: invalid mipmapped texture import, "
-                                       "metadata has last_level = %u, the caller set %u\n",
-                               last_level, tex->buffer.b.b.last_level);
-                       return false;
-               }
-       }
-
-       if (sscreen->info.chip_class >= GFX8 &&
-           G_008F28_COMPRESSION_EN(desc[6])) {
-               /* Read DCC information. */
-               switch (sscreen->info.chip_class) {
-               case GFX8:
-                       tex->surface.dcc_offset = (uint64_t)desc[7] << 8;
-                       break;
-
-               case GFX9:
-                       tex->surface.dcc_offset =
-                               ((uint64_t)desc[7] << 8) |
-                               ((uint64_t)G_008F24_META_DATA_ADDRESS(desc[5]) << 40);
-                       tex->surface.u.gfx9.dcc.pipe_aligned =
-                               G_008F24_META_PIPE_ALIGNED(desc[5]);
-                       tex->surface.u.gfx9.dcc.rb_aligned =
-                               G_008F24_META_RB_ALIGNED(desc[5]);
-
-                       /* If DCC is unaligned, this can only be a displayable image. */
-                       if (!tex->surface.u.gfx9.dcc.pipe_aligned &&
-                           !tex->surface.u.gfx9.dcc.rb_aligned)
-                               assert(tex->surface.is_displayable);
-                       break;
-
-               case GFX10:
-                       tex->surface.dcc_offset =
-                               ((uint64_t)G_00A018_META_DATA_ADDRESS_LO(desc[6]) << 8) |
-                               ((uint64_t)desc[7] << 16);
-                       tex->surface.u.gfx9.dcc.pipe_aligned =
-                               G_00A018_META_PIPE_ALIGNED(desc[6]);
-                       break;
-
-               default:
-                       assert(0);
-                       return false;
-               }
-       } else {
-               /* Disable DCC. dcc_offset is always set by texture_from_handle
-                * and must be cleared here.
-                */
-               si_texture_zero_dcc_fields(tex);
-       }
-
-       return true;
+   uint32_t *desc = &md->metadata[2];
+
+   if (offset ||                     /* Non-zero planes ignore metadata. */
+       md->size_metadata < 10 * 4 || /* at least 2(header) + 8(desc) dwords */
+       md->metadata[0] == 0 ||       /* invalid version number */
+       md->metadata[1] != si_get_bo_metadata_word1(sscreen)) /* invalid PCI ID */ {
+      /* Disable DCC because it might not be enabled. */
+      si_texture_zero_dcc_fields(tex);
+
+      /* Don't report an error if the texture comes from an incompatible driver,
+       * but this might not work.
+       */
+      return true;
+   }
+
+   /* Validate that sample counts and the number of mipmap levels match. */
+   unsigned last_level = G_008F1C_LAST_LEVEL(desc[3]);
+   unsigned type = G_008F1C_TYPE(desc[3]);
+
+   if (type == V_008F1C_SQ_RSRC_IMG_2D_MSAA || type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) {
+      unsigned log_samples = util_logbase2(MAX2(1, tex->buffer.b.b.nr_storage_samples));
+
+      if (last_level != log_samples) {
+         fprintf(stderr,
+                 "radeonsi: invalid MSAA texture import, "
+                 "metadata has log2(samples) = %u, the caller set %u\n",
+                 last_level, log_samples);
+         return false;
+      }
+   } else {
+      if (last_level != tex->buffer.b.b.last_level) {
+         fprintf(stderr,
+                 "radeonsi: invalid mipmapped texture import, "
+                 "metadata has last_level = %u, the caller set %u\n",
+                 last_level, tex->buffer.b.b.last_level);
+         return false;
+      }
+   }
+
+   if (sscreen->info.chip_class >= GFX8 && G_008F28_COMPRESSION_EN(desc[6])) {
+      /* Read DCC information. */
+      switch (sscreen->info.chip_class) {
+      case GFX8:
+         tex->surface.dcc_offset = (uint64_t)desc[7] << 8;
+         break;
+
+      case GFX9:
+         tex->surface.dcc_offset =
+            ((uint64_t)desc[7] << 8) | ((uint64_t)G_008F24_META_DATA_ADDRESS(desc[5]) << 40);
+         tex->surface.u.gfx9.dcc.pipe_aligned = G_008F24_META_PIPE_ALIGNED(desc[5]);
+         tex->surface.u.gfx9.dcc.rb_aligned = G_008F24_META_RB_ALIGNED(desc[5]);
+
+         /* If DCC is unaligned, this can only be a displayable image. */
+         if (!tex->surface.u.gfx9.dcc.pipe_aligned && !tex->surface.u.gfx9.dcc.rb_aligned)
+            assert(tex->surface.is_displayable);
+         break;
+
+      case GFX10:
+         tex->surface.dcc_offset =
+            ((uint64_t)G_00A018_META_DATA_ADDRESS_LO(desc[6]) << 8) | ((uint64_t)desc[7] << 16);
+         tex->surface.u.gfx9.dcc.pipe_aligned = G_00A018_META_PIPE_ALIGNED(desc[6]);
+         break;
+
+      default:
+         assert(0);
+         return false;
+      }
+   } else {
+      /* Disable DCC. dcc_offset is always set by texture_from_handle
+       * and must be cleared here.
+       */
+      si_texture_zero_dcc_fields(tex);
+   }
+
+   return true;
 }
 
 static bool si_has_displayable_dcc(struct si_texture *tex)
 {
-       struct si_screen *sscreen = (struct si_screen*)tex->buffer.b.b.screen;
-
-       if (sscreen->info.chip_class <= GFX8)
-               return false;
-
-       /* This needs a cache flush before scanout.
-        * (it can't be scanned out and rendered to simultaneously)
-        */
-       if (sscreen->info.use_display_dcc_unaligned &&
-           tex->surface.dcc_offset &&
-           !tex->surface.u.gfx9.dcc.pipe_aligned &&
-           !tex->surface.u.gfx9.dcc.rb_aligned)
-               return true;
-
-       /* This needs an explicit flush (flush_resource). */
-       if (sscreen->info.use_display_dcc_with_retile_blit &&
-           tex->surface.display_dcc_offset)
-               return true;
-
-       return false;
+   struct si_screen *sscreen = (struct si_screen *)tex->buffer.b.b.screen;
+
+   if (sscreen->info.chip_class <= GFX8)
+      return false;
+
+   /* This needs a cache flush before scanout.
+    * (it can't be scanned out and rendered to simultaneously)
+    */
+   if (sscreen->info.use_display_dcc_unaligned && tex->surface.dcc_offset &&
+       !tex->surface.u.gfx9.dcc.pipe_aligned && !tex->surface.u.gfx9.dcc.rb_aligned)
+      return true;
+
+   /* This needs an explicit flush (flush_resource). */
+   if (sscreen->info.use_display_dcc_with_retile_blit && tex->surface.display_dcc_offset)
+      return true;
+
+   return false;
 }
 
-static bool si_resource_get_param(struct pipe_screen *screen,
-                                 struct pipe_context *context,
-                                 struct pipe_resource *resource,
-                                 unsigned plane,
-                                 unsigned layer,
-                                 enum pipe_resource_param param,
-                                 unsigned handle_usage,
-                                 uint64_t *value)
+static bool si_resource_get_param(struct pipe_screen *screen, struct pipe_context *context,
+                                  struct pipe_resource *resource, unsigned plane, unsigned layer,
+                                  enum pipe_resource_param param, unsigned handle_usage,
+                                  uint64_t *value)
 {
-       for (unsigned i = 0; i < plane; i++)
-               resource = resource->next;
-
-       struct si_screen *sscreen = (struct si_screen*)screen;
-       struct si_texture *tex = (struct si_texture*)resource;
-       struct winsys_handle whandle;
-
-       switch (param) {
-       case PIPE_RESOURCE_PARAM_NPLANES:
-               *value = resource->target == PIPE_BUFFER ? 1 : tex->num_planes;
-               return true;
-
-       case PIPE_RESOURCE_PARAM_STRIDE:
-               if (resource->target == PIPE_BUFFER)
-                       *value = 0;
-               else if (sscreen->info.chip_class >= GFX9)
-                       *value = tex->surface.u.gfx9.surf_pitch * tex->surface.bpe;
-               else
-                       *value = tex->surface.u.legacy.level[0].nblk_x * tex->surface.bpe;
-               return true;
-
-       case PIPE_RESOURCE_PARAM_OFFSET:
-               if (resource->target == PIPE_BUFFER)
-                       *value = 0;
-               else if (sscreen->info.chip_class >= GFX9)
-                       *value = tex->surface.u.gfx9.surf_offset +
-                                layer * tex->surface.u.gfx9.surf_slice_size;
-               else
-                       *value = tex->surface.u.legacy.level[0].offset +
-                                layer * (uint64_t)tex->surface.u.legacy.level[0].slice_size_dw * 4;
-               return true;
-
-       case PIPE_RESOURCE_PARAM_MODIFIER:
-               *value = DRM_FORMAT_MOD_INVALID;
-               return true;
-
-       case PIPE_RESOURCE_PARAM_HANDLE_TYPE_SHARED:
-       case PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS:
-       case PIPE_RESOURCE_PARAM_HANDLE_TYPE_FD:
-               memset(&whandle, 0, sizeof(whandle));
-
-               if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_SHARED)
-                       whandle.type = WINSYS_HANDLE_TYPE_SHARED;
-               else if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS)
-                       whandle.type = WINSYS_HANDLE_TYPE_KMS;
-               else if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_FD)
-                       whandle.type = WINSYS_HANDLE_TYPE_FD;
-
-               if (!screen->resource_get_handle(screen, context, resource,
-                                                &whandle, handle_usage))
-                       return false;
-
-               *value = whandle.handle;
-               return true;
-       }
-       return false;
+   for (unsigned i = 0; i < plane; i++)
+      resource = resource->next;
+
+   struct si_screen *sscreen = (struct si_screen *)screen;
+   struct si_texture *tex = (struct si_texture *)resource;
+   struct winsys_handle whandle;
+
+   switch (param) {
+   case PIPE_RESOURCE_PARAM_NPLANES:
+      *value = resource->target == PIPE_BUFFER ? 1 : tex->num_planes;
+      return true;
+
+   case PIPE_RESOURCE_PARAM_STRIDE:
+      if (resource->target == PIPE_BUFFER)
+         *value = 0;
+      else if (sscreen->info.chip_class >= GFX9)
+         *value = tex->surface.u.gfx9.surf_pitch * tex->surface.bpe;
+      else
+         *value = tex->surface.u.legacy.level[0].nblk_x * tex->surface.bpe;
+      return true;
+
+   case PIPE_RESOURCE_PARAM_OFFSET:
+      if (resource->target == PIPE_BUFFER)
+         *value = 0;
+      else if (sscreen->info.chip_class >= GFX9)
+         *value = tex->surface.u.gfx9.surf_offset + layer * tex->surface.u.gfx9.surf_slice_size;
+      else
+         *value = tex->surface.u.legacy.level[0].offset +
+                  layer * (uint64_t)tex->surface.u.legacy.level[0].slice_size_dw * 4;
+      return true;
+
+   case PIPE_RESOURCE_PARAM_MODIFIER:
+      *value = DRM_FORMAT_MOD_INVALID;
+      return true;
+
+   case PIPE_RESOURCE_PARAM_HANDLE_TYPE_SHARED:
+   case PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS:
+   case PIPE_RESOURCE_PARAM_HANDLE_TYPE_FD:
+      memset(&whandle, 0, sizeof(whandle));
+
+      if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_SHARED)
+         whandle.type = WINSYS_HANDLE_TYPE_SHARED;
+      else if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS)
+         whandle.type = WINSYS_HANDLE_TYPE_KMS;
+      else if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_FD)
+         whandle.type = WINSYS_HANDLE_TYPE_FD;
+
+      if (!screen->resource_get_handle(screen, context, resource, &whandle, handle_usage))
+         return false;
+
+      *value = whandle.handle;
+      return true;
+   }
+   return false;
 }
 
-static void si_texture_get_info(struct pipe_screen* screen,
-                               struct pipe_resource *resource,
-                               unsigned *pstride,
-                               unsigned *poffset)
+static void si_texture_get_info(struct pipe_screen *screen, struct pipe_resource *resource,
+                                unsigned *pstride, unsigned *poffset)
 {
-       uint64_t value;
-
-       if (pstride) {
-               si_resource_get_param(screen, NULL, resource, 0, 0,
-                                     PIPE_RESOURCE_PARAM_STRIDE, 0, &value);
-               *pstride = value;
-       }
-
-       if (poffset) {
-               si_resource_get_param(screen, NULL, resource, 0, 0,
-                                     PIPE_RESOURCE_PARAM_OFFSET, 0, &value);
-               *poffset = value;
-       }
+   uint64_t value;
+
+   if (pstride) {
+      si_resource_get_param(screen, NULL, resource, 0, 0, PIPE_RESOURCE_PARAM_STRIDE, 0, &value);
+      *pstride = value;
+   }
+
+   if (poffset) {
+      si_resource_get_param(screen, NULL, resource, 0, 0, PIPE_RESOURCE_PARAM_OFFSET, 0, &value);
+      *poffset = value;
+   }
 }
 
-static bool si_texture_get_handle(struct pipe_screen* screen,
-                                 struct pipe_context *ctx,
-                                 struct pipe_resource *resource,
-                                 struct winsys_handle *whandle,
-                                 unsigned usage)
+static bool si_texture_get_handle(struct pipe_screen *screen, struct pipe_context *ctx,
+                                  struct pipe_resource *resource, struct winsys_handle *whandle,
+                                  unsigned usage)
 {
-       struct si_screen *sscreen = (struct si_screen*)screen;
-       struct si_context *sctx;
-       struct si_resource *res = si_resource(resource);
-       struct si_texture *tex = (struct si_texture*)resource;
-       bool update_metadata = false;
-       unsigned stride, offset, slice_size;
-       bool flush = false;
-
-       ctx = threaded_context_unwrap_sync(ctx);
-       sctx = (struct si_context*)(ctx ? ctx : sscreen->aux_context);
-
-       if (resource->target != PIPE_BUFFER) {
-               /* Individual planes are chained pipe_resource instances. */
-               for (unsigned i = 0; i < whandle->plane; i++) {
-                       resource = resource->next;
-                       res = si_resource(resource);
-                       tex = (struct si_texture*)resource;
-               }
-
-               /* This is not supported now, but it might be required for OpenCL
-                * interop in the future.
-                */
-               if (resource->nr_samples > 1 || tex->is_depth)
-                       return false;
-
-               /* Move a suballocated texture into a non-suballocated allocation. */
-               if (sscreen->ws->buffer_is_suballocated(res->buf) ||
-                   tex->surface.tile_swizzle ||
-                   (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING &&
-                    sscreen->info.has_local_buffers)) {
-                       assert(!res->b.is_shared);
-                       si_reallocate_texture_inplace(sctx, tex,
-                                                       PIPE_BIND_SHARED, false);
-                       flush = true;
-                       assert(res->b.b.bind & PIPE_BIND_SHARED);
-                       assert(res->flags & RADEON_FLAG_NO_SUBALLOC);
-                       assert(!(res->flags & RADEON_FLAG_NO_INTERPROCESS_SHARING));
-                       assert(tex->surface.tile_swizzle == 0);
-               }
-
-               /* Since shader image stores don't support DCC on GFX8,
-                * disable it for external clients that want write
-                * access.
-                */
-               if ((usage & PIPE_HANDLE_USAGE_SHADER_WRITE && tex->surface.dcc_offset) ||
-                   /* Displayable DCC requires an explicit flush. */
-                   (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) &&
-                    si_has_displayable_dcc(tex))) {
-                       if (si_texture_disable_dcc(sctx, tex)) {
-                               update_metadata = true;
-                               /* si_texture_disable_dcc flushes the context */
-                               flush = false;
-                       }
-               }
-
-               if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) &&
-                   (tex->cmask_buffer || tex->surface.dcc_offset)) {
-                       /* Eliminate fast clear (both CMASK and DCC) */
-                       si_eliminate_fast_color_clear(sctx, tex);
-                       /* eliminate_fast_color_clear flushes the context */
-                       flush = false;
-
-                       /* Disable CMASK if flush_resource isn't going
-                        * to be called.
-                        */
-                       if (tex->cmask_buffer)
-                               si_texture_discard_cmask(sscreen, tex);
-               }
-
-               /* Set metadata. */
-               if ((!res->b.is_shared || update_metadata) && whandle->offset == 0)
-                       si_set_tex_bo_metadata(sscreen, tex);
-
-               if (sscreen->info.chip_class >= GFX9) {
-                       slice_size = tex->surface.u.gfx9.surf_slice_size;
-               } else {
-                       slice_size = (uint64_t)tex->surface.u.legacy.level[0].slice_size_dw * 4;
-               }
-       } else {
-               /* Buffer exports are for the OpenCL interop. */
-               /* Move a suballocated buffer into a non-suballocated allocation. */
-               if (sscreen->ws->buffer_is_suballocated(res->buf) ||
-                   /* A DMABUF export always fails if the BO is local. */
-                   (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING &&
-                    sscreen->info.has_local_buffers)) {
-                       assert(!res->b.is_shared);
-
-                       /* Allocate a new buffer with PIPE_BIND_SHARED. */
-                       struct pipe_resource templ = res->b.b;
-                       templ.bind |= PIPE_BIND_SHARED;
-
-                       struct pipe_resource *newb =
-                               screen->resource_create(screen, &templ);
-                       if (!newb)
-                               return false;
-
-                       /* Copy the old buffer contents to the new one. */
-                       struct pipe_box box;
-                       u_box_1d(0, newb->width0, &box);
-                       sctx->b.resource_copy_region(&sctx->b, newb, 0, 0, 0, 0,
-                                                    &res->b.b, 0, &box);
-                       flush = true;
-                       /* Move the new buffer storage to the old pipe_resource. */
-                       si_replace_buffer_storage(&sctx->b, &res->b.b, newb);
-                       pipe_resource_reference(&newb, NULL);
-
-                       assert(res->b.b.bind & PIPE_BIND_SHARED);
-                       assert(res->flags & RADEON_FLAG_NO_SUBALLOC);
-               }
-
-               /* Buffers */
-               slice_size = 0;
-       }
-
-       si_texture_get_info(screen, resource, &stride, &offset);
-
-       if (flush)
-               sctx->b.flush(&sctx->b, NULL, 0);
-
-       if (res->b.is_shared) {
-               /* USAGE_EXPLICIT_FLUSH must be cleared if at least one user
-                * doesn't set it.
-                */
-               res->external_usage |= usage & ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH;
-               if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
-                       res->external_usage &= ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH;
-       } else {
-               res->b.is_shared = true;
-               res->external_usage = usage;
-       }
-
-       whandle->stride = stride;
-       whandle->offset = offset + slice_size * whandle->layer;
-
-       return sscreen->ws->buffer_get_handle(sscreen->ws, res->buf, whandle);
+   struct si_screen *sscreen = (struct si_screen *)screen;
+   struct si_context *sctx;
+   struct si_resource *res = si_resource(resource);
+   struct si_texture *tex = (struct si_texture *)resource;
+   bool update_metadata = false;
+   unsigned stride, offset, slice_size;
+   bool flush = false;
+
+   ctx = threaded_context_unwrap_sync(ctx);
+   sctx = (struct si_context *)(ctx ? ctx : sscreen->aux_context);
+
+   if (resource->target != PIPE_BUFFER) {
+      /* Individual planes are chained pipe_resource instances. */
+      for (unsigned i = 0; i < whandle->plane; i++) {
+         resource = resource->next;
+         res = si_resource(resource);
+         tex = (struct si_texture *)resource;
+      }
+
+      /* This is not supported now, but it might be required for OpenCL
+       * interop in the future.
+       */
+      if (resource->nr_samples > 1 || tex->is_depth)
+         return false;
+
+      /* Move a suballocated texture into a non-suballocated allocation. */
+      if (sscreen->ws->buffer_is_suballocated(res->buf) || tex->surface.tile_swizzle ||
+          (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING &&
+           sscreen->info.has_local_buffers)) {
+         assert(!res->b.is_shared);
+         si_reallocate_texture_inplace(sctx, tex, PIPE_BIND_SHARED, false);
+         flush = true;
+         assert(res->b.b.bind & PIPE_BIND_SHARED);
+         assert(res->flags & RADEON_FLAG_NO_SUBALLOC);
+         assert(!(res->flags & RADEON_FLAG_NO_INTERPROCESS_SHARING));
+         assert(tex->surface.tile_swizzle == 0);
+      }
+
+      /* Since shader image stores don't support DCC on GFX8,
+       * disable it for external clients that want write
+       * access.
+       */
+      if ((usage & PIPE_HANDLE_USAGE_SHADER_WRITE && tex->surface.dcc_offset) ||
+          /* Displayable DCC requires an explicit flush. */
+          (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && si_has_displayable_dcc(tex))) {
+         if (si_texture_disable_dcc(sctx, tex)) {
+            update_metadata = true;
+            /* si_texture_disable_dcc flushes the context */
+            flush = false;
+         }
+      }
+
+      if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) &&
+          (tex->cmask_buffer || tex->surface.dcc_offset)) {
+         /* Eliminate fast clear (both CMASK and DCC) */
+         si_eliminate_fast_color_clear(sctx, tex);
+         /* eliminate_fast_color_clear flushes the context */
+         flush = false;
+
+         /* Disable CMASK if flush_resource isn't going
+          * to be called.
+          */
+         if (tex->cmask_buffer)
+            si_texture_discard_cmask(sscreen, tex);
+      }
+
+      /* Set metadata. */
+      if ((!res->b.is_shared || update_metadata) && whandle->offset == 0)
+         si_set_tex_bo_metadata(sscreen, tex);
+
+      if (sscreen->info.chip_class >= GFX9) {
+         slice_size = tex->surface.u.gfx9.surf_slice_size;
+      } else {
+         slice_size = (uint64_t)tex->surface.u.legacy.level[0].slice_size_dw * 4;
+      }
+   } else {
+      /* Buffer exports are for the OpenCL interop. */
+      /* Move a suballocated buffer into a non-suballocated allocation. */
+      if (sscreen->ws->buffer_is_suballocated(res->buf) ||
+          /* A DMABUF export always fails if the BO is local. */
+          (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING &&
+           sscreen->info.has_local_buffers)) {
+         assert(!res->b.is_shared);
+
+         /* Allocate a new buffer with PIPE_BIND_SHARED. */
+         struct pipe_resource templ = res->b.b;
+         templ.bind |= PIPE_BIND_SHARED;
+
+         struct pipe_resource *newb = screen->resource_create(screen, &templ);
+         if (!newb)
+            return false;
+
+         /* Copy the old buffer contents to the new one. */
+         struct pipe_box box;
+         u_box_1d(0, newb->width0, &box);
+         sctx->b.resource_copy_region(&sctx->b, newb, 0, 0, 0, 0, &res->b.b, 0, &box);
+         flush = true;
+         /* Move the new buffer storage to the old pipe_resource. */
+         si_replace_buffer_storage(&sctx->b, &res->b.b, newb);
+         pipe_resource_reference(&newb, NULL);
+
+         assert(res->b.b.bind & PIPE_BIND_SHARED);
+         assert(res->flags & RADEON_FLAG_NO_SUBALLOC);
+      }
+
+      /* Buffers */
+      slice_size = 0;
+   }
+
+   si_texture_get_info(screen, resource, &stride, &offset);
+
+   if (flush)
+      sctx->b.flush(&sctx->b, NULL, 0);
+
+   if (res->b.is_shared) {
+      /* USAGE_EXPLICIT_FLUSH must be cleared if at least one user
+       * doesn't set it.
+       */
+      res->external_usage |= usage & ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH;
+      if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
+         res->external_usage &= ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH;
+   } else {
+      res->b.is_shared = true;
+      res->external_usage = usage;
+   }
+
+   whandle->stride = stride;
+   whandle->offset = offset + slice_size * whandle->layer;
+
+   return sscreen->ws->buffer_get_handle(sscreen->ws, res->buf, whandle);
 }
 
-static void si_texture_destroy(struct pipe_screen *screen,
-                              struct pipe_resource *ptex)
+static void si_texture_destroy(struct pipe_screen *screen, struct pipe_resource *ptex)
 {
-       struct si_screen *sscreen = (struct si_screen*)screen;
-       struct si_texture *tex = (struct si_texture*)ptex;
-       struct si_resource *resource = &tex->buffer;
-
-       if (sscreen->info.chip_class >= GFX9)
-               free(tex->surface.u.gfx9.dcc_retile_map);
-
-       si_texture_reference(&tex->flushed_depth_texture, NULL);
-
-       if (tex->cmask_buffer != &tex->buffer) {
-           si_resource_reference(&tex->cmask_buffer, NULL);
-       }
-       pb_reference(&resource->buf, NULL);
-       si_resource_reference(&tex->dcc_separate_buffer, NULL);
-       si_resource_reference(&tex->last_dcc_separate_buffer, NULL);
-       FREE(tex);
+   struct si_screen *sscreen = (struct si_screen *)screen;
+   struct si_texture *tex = (struct si_texture *)ptex;
+   struct si_resource *resource = &tex->buffer;
+
+   if (sscreen->info.chip_class >= GFX9)
+      free(tex->surface.u.gfx9.dcc_retile_map);
+
+   si_texture_reference(&tex->flushed_depth_texture, NULL);
+
+   if (tex->cmask_buffer != &tex->buffer) {
+      si_resource_reference(&tex->cmask_buffer, NULL);
+   }
+   pb_reference(&resource->buf, NULL);
+   si_resource_reference(&tex->dcc_separate_buffer, NULL);
+   si_resource_reference(&tex->last_dcc_separate_buffer, NULL);
+   FREE(tex);
 }
 
 static const struct u_resource_vtbl si_texture_vtbl;
 
-void si_print_texture_info(struct si_screen *sscreen,
-                          struct si_texture *tex, struct u_log_context *log)
+void si_print_texture_info(struct si_screen *sscreen, struct si_texture *tex,
+                           struct u_log_context *log)
 {
-       int i;
-
-       /* Common parameters. */
-       u_log_printf(log, "  Info: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, "
-               "blk_h=%u, array_size=%u, last_level=%u, "
-               "bpe=%u, nsamples=%u, flags=0x%x, %s\n",
-               tex->buffer.b.b.width0, tex->buffer.b.b.height0,
-               tex->buffer.b.b.depth0, tex->surface.blk_w,
-               tex->surface.blk_h,
-               tex->buffer.b.b.array_size, tex->buffer.b.b.last_level,
-               tex->surface.bpe, tex->buffer.b.b.nr_samples,
-               tex->surface.flags, util_format_short_name(tex->buffer.b.b.format));
-
-       if (sscreen->info.chip_class >= GFX9) {
-               u_log_printf(log, "  Surf: size=%"PRIu64", slice_size=%"PRIu64", "
-                       "alignment=%u, swmode=%u, epitch=%u, pitch=%u\n",
-                       tex->surface.surf_size,
-                       tex->surface.u.gfx9.surf_slice_size,
-                       tex->surface.surf_alignment,
-                       tex->surface.u.gfx9.surf.swizzle_mode,
-                       tex->surface.u.gfx9.surf.epitch,
-                       tex->surface.u.gfx9.surf_pitch);
-
-               if (tex->surface.fmask_offset) {
-                       u_log_printf(log, "  FMASK: offset=%"PRIu64", size=%"PRIu64", "
-                               "alignment=%u, swmode=%u, epitch=%u\n",
-                               tex->surface.fmask_offset,
-                               tex->surface.fmask_size,
-                               tex->surface.fmask_alignment,
-                               tex->surface.u.gfx9.fmask.swizzle_mode,
-                               tex->surface.u.gfx9.fmask.epitch);
-               }
-
-               if (tex->cmask_buffer) {
-                       u_log_printf(log, "  CMask: offset=%"PRIu64", size=%u, "
-                               "alignment=%u, rb_aligned=%u, pipe_aligned=%u\n",
-                               tex->surface.cmask_offset,
-                               tex->surface.cmask_size,
-                               tex->surface.cmask_alignment,
-                               tex->surface.u.gfx9.cmask.rb_aligned,
-                               tex->surface.u.gfx9.cmask.pipe_aligned);
-               }
-
-               if (tex->surface.htile_offset) {
-                       u_log_printf(log, "  HTile: offset=%"PRIu64", size=%u, alignment=%u, "
-                               "rb_aligned=%u, pipe_aligned=%u\n",
-                               tex->surface.htile_offset,
-                               tex->surface.htile_size,
-                               tex->surface.htile_alignment,
-                               tex->surface.u.gfx9.htile.rb_aligned,
-                               tex->surface.u.gfx9.htile.pipe_aligned);
-               }
-
-               if (tex->surface.dcc_offset) {
-                       u_log_printf(log, "  DCC: offset=%"PRIu64", size=%u, "
-                               "alignment=%u, pitch_max=%u, num_dcc_levels=%u\n",
-                               tex->surface.dcc_offset, tex->surface.dcc_size,
-                               tex->surface.dcc_alignment,
-                               tex->surface.u.gfx9.display_dcc_pitch_max,
-                               tex->surface.num_dcc_levels);
-               }
-
-               if (tex->surface.u.gfx9.stencil_offset) {
-                       u_log_printf(log, "  Stencil: offset=%"PRIu64", swmode=%u, epitch=%u\n",
-                               tex->surface.u.gfx9.stencil_offset,
-                               tex->surface.u.gfx9.stencil.swizzle_mode,
-                               tex->surface.u.gfx9.stencil.epitch);
-               }
-               return;
-       }
-
-       u_log_printf(log, "  Layout: size=%"PRIu64", alignment=%u, bankw=%u, "
-               "bankh=%u, nbanks=%u, mtilea=%u, tilesplit=%u, pipeconfig=%u, scanout=%u\n",
-               tex->surface.surf_size, tex->surface.surf_alignment, tex->surface.u.legacy.bankw,
-               tex->surface.u.legacy.bankh, tex->surface.u.legacy.num_banks, tex->surface.u.legacy.mtilea,
-               tex->surface.u.legacy.tile_split, tex->surface.u.legacy.pipe_config,
-               (tex->surface.flags & RADEON_SURF_SCANOUT) != 0);
-
-       if (tex->surface.fmask_offset)
-               u_log_printf(log, "  FMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, pitch_in_pixels=%u, "
-                       "bankh=%u, slice_tile_max=%u, tile_mode_index=%u\n",
-                       tex->surface.fmask_offset, tex->surface.fmask_size, tex->surface.fmask_alignment,
-                       tex->surface.u.legacy.fmask.pitch_in_pixels,
-                       tex->surface.u.legacy.fmask.bankh,
-                       tex->surface.u.legacy.fmask.slice_tile_max,
-                       tex->surface.u.legacy.fmask.tiling_index);
-
-       if (tex->cmask_buffer)
-               u_log_printf(log, "  CMask: offset=%"PRIu64", size=%u, alignment=%u, "
-                       "slice_tile_max=%u\n",
-                       tex->surface.cmask_offset, tex->surface.cmask_size, tex->surface.cmask_alignment,
-                       tex->surface.u.legacy.cmask_slice_tile_max);
-
-       if (tex->surface.htile_offset)
-               u_log_printf(log, "  HTile: offset=%"PRIu64", size=%u, "
-                       "alignment=%u, TC_compatible = %u\n",
-                       tex->surface.htile_offset, tex->surface.htile_size,
-                       tex->surface.htile_alignment,
-                       tex->tc_compatible_htile);
-
-       if (tex->surface.dcc_offset) {
-               u_log_printf(log, "  DCC: offset=%"PRIu64", size=%u, alignment=%u\n",
-                       tex->surface.dcc_offset, tex->surface.dcc_size,
-                       tex->surface.dcc_alignment);
-               for (i = 0; i <= tex->buffer.b.b.last_level; i++)
-                       u_log_printf(log, "  DCCLevel[%i]: enabled=%u, offset=%u, "
-                               "fast_clear_size=%u\n",
-                               i, i < tex->surface.num_dcc_levels,
-                               tex->surface.u.legacy.level[i].dcc_offset,
-                               tex->surface.u.legacy.level[i].dcc_fast_clear_size);
-       }
-
-       for (i = 0; i <= tex->buffer.b.b.last_level; i++)
-               u_log_printf(log, "  Level[%i]: offset=%"PRIu64", slice_size=%"PRIu64", "
-                       "npix_x=%u, npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, "
-                       "mode=%u, tiling_index = %u\n",
-                       i, tex->surface.u.legacy.level[i].offset,
-                       (uint64_t)tex->surface.u.legacy.level[i].slice_size_dw * 4,
-                       u_minify(tex->buffer.b.b.width0, i),
-                       u_minify(tex->buffer.b.b.height0, i),
-                       u_minify(tex->buffer.b.b.depth0, i),
-                       tex->surface.u.legacy.level[i].nblk_x,
-                       tex->surface.u.legacy.level[i].nblk_y,
-                       tex->surface.u.legacy.level[i].mode,
-                       tex->surface.u.legacy.tiling_index[i]);
-
-       if (tex->surface.has_stencil) {
-               u_log_printf(log, "  StencilLayout: tilesplit=%u\n",
-                       tex->surface.u.legacy.stencil_tile_split);
-               for (i = 0; i <= tex->buffer.b.b.last_level; i++) {
-                       u_log_printf(log, "  StencilLevel[%i]: offset=%"PRIu64", "
-                               "slice_size=%"PRIu64", npix_x=%u, "
-                               "npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, "
-                               "mode=%u, tiling_index = %u\n",
-                               i, tex->surface.u.legacy.stencil_level[i].offset,
-                               (uint64_t)tex->surface.u.legacy.stencil_level[i].slice_size_dw * 4,
-                               u_minify(tex->buffer.b.b.width0, i),
-                               u_minify(tex->buffer.b.b.height0, i),
-                               u_minify(tex->buffer.b.b.depth0, i),
-                               tex->surface.u.legacy.stencil_level[i].nblk_x,
-                               tex->surface.u.legacy.stencil_level[i].nblk_y,
-                               tex->surface.u.legacy.stencil_level[i].mode,
-                               tex->surface.u.legacy.stencil_tiling_index[i]);
-               }
-       }
+   int i;
+
+   /* Common parameters. */
+   u_log_printf(log,
+                "  Info: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, "
+                "blk_h=%u, array_size=%u, last_level=%u, "
+                "bpe=%u, nsamples=%u, flags=0x%x, %s\n",
+                tex->buffer.b.b.width0, tex->buffer.b.b.height0, tex->buffer.b.b.depth0,
+                tex->surface.blk_w, tex->surface.blk_h, tex->buffer.b.b.array_size,
+                tex->buffer.b.b.last_level, tex->surface.bpe, tex->buffer.b.b.nr_samples,
+                tex->surface.flags, util_format_short_name(tex->buffer.b.b.format));
+
+   if (sscreen->info.chip_class >= GFX9) {
+      u_log_printf(log,
+                   "  Surf: size=%" PRIu64 ", slice_size=%" PRIu64 ", "
+                   "alignment=%u, swmode=%u, epitch=%u, pitch=%u\n",
+                   tex->surface.surf_size, tex->surface.u.gfx9.surf_slice_size,
+                   tex->surface.surf_alignment, tex->surface.u.gfx9.surf.swizzle_mode,
+                   tex->surface.u.gfx9.surf.epitch, tex->surface.u.gfx9.surf_pitch);
+
+      if (tex->surface.fmask_offset) {
+         u_log_printf(log,
+                      "  FMASK: offset=%" PRIu64 ", size=%" PRIu64 ", "
+                      "alignment=%u, swmode=%u, epitch=%u\n",
+                      tex->surface.fmask_offset, tex->surface.fmask_size,
+                      tex->surface.fmask_alignment, tex->surface.u.gfx9.fmask.swizzle_mode,
+                      tex->surface.u.gfx9.fmask.epitch);
+      }
+
+      if (tex->cmask_buffer) {
+         u_log_printf(log,
+                      "  CMask: offset=%" PRIu64 ", size=%u, "
+                      "alignment=%u, rb_aligned=%u, pipe_aligned=%u\n",
+                      tex->surface.cmask_offset, tex->surface.cmask_size,
+                      tex->surface.cmask_alignment, tex->surface.u.gfx9.cmask.rb_aligned,
+                      tex->surface.u.gfx9.cmask.pipe_aligned);
+      }
+
+      if (tex->surface.htile_offset) {
+         u_log_printf(log,
+                      "  HTile: offset=%" PRIu64 ", size=%u, alignment=%u, "
+                      "rb_aligned=%u, pipe_aligned=%u\n",
+                      tex->surface.htile_offset, tex->surface.htile_size,
+                      tex->surface.htile_alignment, tex->surface.u.gfx9.htile.rb_aligned,
+                      tex->surface.u.gfx9.htile.pipe_aligned);
+      }
+
+      if (tex->surface.dcc_offset) {
+         u_log_printf(log,
+                      "  DCC: offset=%" PRIu64 ", size=%u, "
+                      "alignment=%u, pitch_max=%u, num_dcc_levels=%u\n",
+                      tex->surface.dcc_offset, tex->surface.dcc_size, tex->surface.dcc_alignment,
+                      tex->surface.u.gfx9.display_dcc_pitch_max, tex->surface.num_dcc_levels);
+      }
+
+      if (tex->surface.u.gfx9.stencil_offset) {
+         u_log_printf(log, "  Stencil: offset=%" PRIu64 ", swmode=%u, epitch=%u\n",
+                      tex->surface.u.gfx9.stencil_offset, tex->surface.u.gfx9.stencil.swizzle_mode,
+                      tex->surface.u.gfx9.stencil.epitch);
+      }
+      return;
+   }
+
+   u_log_printf(log,
+                "  Layout: size=%" PRIu64 ", alignment=%u, bankw=%u, "
+                "bankh=%u, nbanks=%u, mtilea=%u, tilesplit=%u, pipeconfig=%u, scanout=%u\n",
+                tex->surface.surf_size, tex->surface.surf_alignment, tex->surface.u.legacy.bankw,
+                tex->surface.u.legacy.bankh, tex->surface.u.legacy.num_banks,
+                tex->surface.u.legacy.mtilea, tex->surface.u.legacy.tile_split,
+                tex->surface.u.legacy.pipe_config, (tex->surface.flags & RADEON_SURF_SCANOUT) != 0);
+
+   if (tex->surface.fmask_offset)
+      u_log_printf(
+         log,
+         "  FMask: offset=%" PRIu64 ", size=%" PRIu64 ", alignment=%u, pitch_in_pixels=%u, "
+         "bankh=%u, slice_tile_max=%u, tile_mode_index=%u\n",
+         tex->surface.fmask_offset, tex->surface.fmask_size, tex->surface.fmask_alignment,
+         tex->surface.u.legacy.fmask.pitch_in_pixels, tex->surface.u.legacy.fmask.bankh,
+         tex->surface.u.legacy.fmask.slice_tile_max, tex->surface.u.legacy.fmask.tiling_index);
+
+   if (tex->cmask_buffer)
+      u_log_printf(log,
+                   "  CMask: offset=%" PRIu64 ", size=%u, alignment=%u, "
+                   "slice_tile_max=%u\n",
+                   tex->surface.cmask_offset, tex->surface.cmask_size, tex->surface.cmask_alignment,
+                   tex->surface.u.legacy.cmask_slice_tile_max);
+
+   if (tex->surface.htile_offset)
+      u_log_printf(log,
+                   "  HTile: offset=%" PRIu64 ", size=%u, "
+                   "alignment=%u, TC_compatible = %u\n",
+                   tex->surface.htile_offset, tex->surface.htile_size, tex->surface.htile_alignment,
+                   tex->tc_compatible_htile);
+
+   if (tex->surface.dcc_offset) {
+      u_log_printf(log, "  DCC: offset=%" PRIu64 ", size=%u, alignment=%u\n",
+                   tex->surface.dcc_offset, tex->surface.dcc_size, tex->surface.dcc_alignment);
+      for (i = 0; i <= tex->buffer.b.b.last_level; i++)
+         u_log_printf(log,
+                      "  DCCLevel[%i]: enabled=%u, offset=%u, "
+                      "fast_clear_size=%u\n",
+                      i, i < tex->surface.num_dcc_levels, tex->surface.u.legacy.level[i].dcc_offset,
+                      tex->surface.u.legacy.level[i].dcc_fast_clear_size);
+   }
+
+   for (i = 0; i <= tex->buffer.b.b.last_level; i++)
+      u_log_printf(log,
+                   "  Level[%i]: offset=%" PRIu64 ", slice_size=%" PRIu64 ", "
+                   "npix_x=%u, npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, "
+                   "mode=%u, tiling_index = %u\n",
+                   i, tex->surface.u.legacy.level[i].offset,
+                   (uint64_t)tex->surface.u.legacy.level[i].slice_size_dw * 4,
+                   u_minify(tex->buffer.b.b.width0, i), u_minify(tex->buffer.b.b.height0, i),
+                   u_minify(tex->buffer.b.b.depth0, i), tex->surface.u.legacy.level[i].nblk_x,
+                   tex->surface.u.legacy.level[i].nblk_y, tex->surface.u.legacy.level[i].mode,
+                   tex->surface.u.legacy.tiling_index[i]);
+
+   if (tex->surface.has_stencil) {
+      u_log_printf(log, "  StencilLayout: tilesplit=%u\n",
+                   tex->surface.u.legacy.stencil_tile_split);
+      for (i = 0; i <= tex->buffer.b.b.last_level; i++) {
+         u_log_printf(log,
+                      "  StencilLevel[%i]: offset=%" PRIu64 ", "
+                      "slice_size=%" PRIu64 ", npix_x=%u, "
+                      "npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, "
+                      "mode=%u, tiling_index = %u\n",
+                      i, tex->surface.u.legacy.stencil_level[i].offset,
+                      (uint64_t)tex->surface.u.legacy.stencil_level[i].slice_size_dw * 4,
+                      u_minify(tex->buffer.b.b.width0, i), u_minify(tex->buffer.b.b.height0, i),
+                      u_minify(tex->buffer.b.b.depth0, i),
+                      tex->surface.u.legacy.stencil_level[i].nblk_x,
+                      tex->surface.u.legacy.stencil_level[i].nblk_y,
+                      tex->surface.u.legacy.stencil_level[i].mode,
+                      tex->surface.u.legacy.stencil_tiling_index[i]);
+      }
+   }
 }
 
 /**
@@ -1293,611 +1193,569 @@ void si_print_texture_info(struct si_screen *sscreen,
  * \param alloc_size   the size to allocate if plane0 != NULL
  * \param alignment    alignment for the allocation
  */
-static struct si_texture *
-si_texture_create_object(struct pipe_screen *screen,
-                        const struct pipe_resource *base,
-                        const struct radeon_surf *surface,
-                        const struct si_texture *plane0,
-                        struct pb_buffer *imported_buf,
-                        uint64_t offset,
-                        uint64_t alloc_size,
-                        unsigned alignment)
+static struct si_texture *si_texture_create_object(struct pipe_screen *screen,
+                                                   const struct pipe_resource *base,
+                                                   const struct radeon_surf *surface,
+                                                   const struct si_texture *plane0,
+                                                   struct pb_buffer *imported_buf, uint64_t offset,
+                                                   uint64_t alloc_size, unsigned alignment)
 {
-       struct si_texture *tex;
-       struct si_resource *resource;
-       struct si_screen *sscreen = (struct si_screen*)screen;
-
-       tex = CALLOC_STRUCT(si_texture);
-       if (!tex)
-               goto error;
-
-       resource = &tex->buffer;
-       resource->b.b = *base;
-       resource->b.b.next = NULL;
-       resource->b.vtbl = &si_texture_vtbl;
-       pipe_reference_init(&resource->b.b.reference, 1);
-       resource->b.b.screen = screen;
-
-       /* don't include stencil-only formats which we don't support for rendering */
-       tex->is_depth = util_format_has_depth(util_format_description(tex->buffer.b.b.format));
-       tex->surface = *surface;
-       tex->tc_compatible_htile = tex->surface.htile_size != 0 &&
-                                  (tex->surface.flags &
-                                   RADEON_SURF_TC_COMPATIBLE_HTILE);
-
-       /* TC-compatible HTILE:
-        * - GFX8 only supports Z32_FLOAT.
-        * - GFX9 only supports Z32_FLOAT and Z16_UNORM. */
-       if (tex->tc_compatible_htile) {
-               if (sscreen->info.chip_class >= GFX9 &&
-                   base->format == PIPE_FORMAT_Z16_UNORM)
-                       tex->db_render_format = base->format;
-               else {
-                       tex->db_render_format = PIPE_FORMAT_Z32_FLOAT;
-                       tex->upgraded_depth = base->format != PIPE_FORMAT_Z32_FLOAT &&
-                                              base->format != PIPE_FORMAT_Z32_FLOAT_S8X24_UINT;
-               }
-       } else {
-               tex->db_render_format = base->format;
-       }
-
-       /* Applies to GCN. */
-       tex->last_msaa_resolve_target_micro_mode = tex->surface.micro_tile_mode;
-
-       /* Disable separate DCC at the beginning. DRI2 doesn't reuse buffers
-        * between frames, so the only thing that can enable separate DCC
-        * with DRI2 is multiple slow clears within a frame.
-        */
-       tex->ps_draw_ratio = 0;
-
-       if (sscreen->info.chip_class >= GFX9) {
-               tex->surface.u.gfx9.surf_offset = offset;
-       } else {
-               for (unsigned i = 0; i < ARRAY_SIZE(surface->u.legacy.level); ++i)
-                       tex->surface.u.legacy.level[i].offset += offset;
-       }
-
-       if (tex->is_depth) {
-               if (sscreen->info.chip_class >= GFX9) {
-                       tex->can_sample_z = true;
-                       tex->can_sample_s = true;
-
-                       /* Stencil texturing with HTILE doesn't work
-                        * with mipmapping on Navi10-14. */
-                       if ((sscreen->info.family == CHIP_NAVI10 ||
-                            sscreen->info.family == CHIP_NAVI12 ||
-                            sscreen->info.family == CHIP_NAVI14) &&
-                           base->last_level > 0)
-                               tex->htile_stencil_disabled = true;
-               } else {
-                       tex->can_sample_z = !tex->surface.u.legacy.depth_adjusted;
-                       tex->can_sample_s = !tex->surface.u.legacy.stencil_adjusted;
-               }
-
-               tex->db_compatible = surface->flags & RADEON_SURF_ZBUFFER;
-       } else {
-               if (tex->surface.cmask_offset) {
-                       tex->cb_color_info |= S_028C70_FAST_CLEAR(1);
-                       tex->cmask_buffer = &tex->buffer;
-               }
-       }
-
-       if (plane0) {
-               /* The buffer is shared with the first plane. */
-               resource->bo_size = plane0->buffer.bo_size;
-               resource->bo_alignment = plane0->buffer.bo_alignment;
-               resource->flags = plane0->buffer.flags;
-               resource->domains = plane0->buffer.domains;
-               resource->vram_usage = plane0->buffer.vram_usage;
-               resource->gart_usage = plane0->buffer.gart_usage;
-
-               pb_reference(&resource->buf, plane0->buffer.buf);
-               resource->gpu_address = plane0->buffer.gpu_address;
-       } else if (!(surface->flags & RADEON_SURF_IMPORTED)) {
-               /* Create the backing buffer. */
-               si_init_resource_fields(sscreen, resource, alloc_size, alignment);
-
-               if (!si_alloc_resource(sscreen, resource))
-                       goto error;
-       } else {
-               resource->buf = imported_buf;
-               resource->gpu_address = sscreen->ws->buffer_get_virtual_address(resource->buf);
-               resource->bo_size = imported_buf->size;
-               resource->bo_alignment = imported_buf->alignment;
-               resource->domains = sscreen->ws->buffer_get_initial_domain(resource->buf);
-               if (resource->domains & RADEON_DOMAIN_VRAM)
-                       resource->vram_usage = resource->bo_size;
-               else if (resource->domains & RADEON_DOMAIN_GTT)
-                       resource->gart_usage = resource->bo_size;
-       }
-
-       if (tex->cmask_buffer) {
-               /* Initialize the cmask to 0xCC (= compressed state). */
-               si_screen_clear_buffer(sscreen, &tex->cmask_buffer->b.b,
-                                        tex->surface.cmask_offset, tex->surface.cmask_size,
-                                        0xCCCCCCCC);
-       }
-       if (tex->surface.htile_offset) {
-               uint32_t clear_value = 0;
-
-               if (sscreen->info.chip_class >= GFX9 || tex->tc_compatible_htile)
-                       clear_value = 0x0000030F;
-
-               si_screen_clear_buffer(sscreen, &tex->buffer.b.b,
-                                        tex->surface.htile_offset,
-                                        tex->surface.htile_size,
-                                        clear_value);
-       }
-
-       /* Initialize DCC only if the texture is not being imported. */
-       if (!(surface->flags & RADEON_SURF_IMPORTED) && tex->surface.dcc_offset) {
-               /* Clear DCC to black for all tiles with DCC enabled.
-                *
-                * This fixes corruption in 3DMark Slingshot Extreme, which
-                * uses uninitialized textures.
-                */
-               if (tex->surface.num_dcc_levels == tex->buffer.b.b.last_level + 1 &&
-                   tex->buffer.b.b.nr_samples <= 2) {
-                       /* Simple case - all tiles have DCC enabled. */
-                       si_screen_clear_buffer(sscreen, &tex->buffer.b.b,
-                                              tex->surface.dcc_offset,
-                                              tex->surface.dcc_size,
-                                              DCC_CLEAR_COLOR_0000);
-               } else if (sscreen->info.chip_class >= GFX9) {
-                       /* Clear to uncompressed. Clearing this to black is complicated. */
-                       si_screen_clear_buffer(sscreen, &tex->buffer.b.b,
-                                              tex->surface.dcc_offset,
-                                              tex->surface.dcc_size,
-                                              DCC_UNCOMPRESSED);
-               } else {
-                       /* GFX8: Initialize mipmap levels and multisamples separately. */
-                       if (tex->buffer.b.b.nr_samples >= 2) {
-                               /* Clearing this to black is complicated. */
-                               si_screen_clear_buffer(sscreen, &tex->buffer.b.b,
-                                                      tex->surface.dcc_offset,
-                                                      tex->surface.dcc_size,
-                                                      DCC_UNCOMPRESSED);
-                       } else {
-                               /* Clear the enabled mipmap levels to black. */
-                               unsigned size = 0;
-
-                               for (unsigned i = 0; i < tex->surface.num_dcc_levels; i++) {
-                                       if (!tex->surface.u.legacy.level[i].dcc_fast_clear_size)
-                                               break;
-
-                                       size = tex->surface.u.legacy.level[i].dcc_offset +
-                                              tex->surface.u.legacy.level[i].dcc_fast_clear_size;
-                               }
-
-                               /* Mipmap levels with DCC. */
-                               if (size) {
-                                       si_screen_clear_buffer(sscreen, &tex->buffer.b.b,
-                                                              tex->surface.dcc_offset, size,
-                                                              DCC_CLEAR_COLOR_0000);
-                               }
-                               /* Mipmap levels without DCC. */
-                               if (size != tex->surface.dcc_size) {
-                                       si_screen_clear_buffer(sscreen, &tex->buffer.b.b,
-                                                              tex->surface.dcc_offset + size,
-                                                              tex->surface.dcc_size - size,
-                                                              DCC_UNCOMPRESSED);
-                               }
-                       }
-               }
-
-               /* Initialize displayable DCC that requires the retile blit. */
-               if (tex->surface.dcc_retile_map_offset) {
-                       /* Uninitialized DCC can hang the display hw.
-                        * Clear to white to indicate that. */
-                       si_screen_clear_buffer(sscreen, &tex->buffer.b.b,
-                                              tex->surface.display_dcc_offset,
-                                              tex->surface.u.gfx9.display_dcc_size,
-                                              DCC_CLEAR_COLOR_1111);
-
-                       /* Upload the DCC retile map.
-                        * Use a staging buffer for the upload, because
-                        * the buffer backing the texture is unmappable.
-                        */
-                       bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16;
-                       unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements;
-                       struct si_resource *buf =
-                               si_aligned_buffer_create(screen, 0, PIPE_USAGE_STREAM,
-                                                        num_elements * (use_uint16 ? 2 : 4),
-                                                        sscreen->info.tcc_cache_line_size);
-                       uint32_t *ui = (uint32_t*)sscreen->ws->buffer_map(buf->buf, NULL,
-                                                                         PIPE_TRANSFER_WRITE);
-                       uint16_t *us = (uint16_t*)ui;
-
-                       /* Upload the retile map into a staging buffer. */
-                       if (use_uint16) {
-                               for (unsigned i = 0; i < num_elements; i++)
-                                       us[i] = tex->surface.u.gfx9.dcc_retile_map[i];
-                       } else {
-                               for (unsigned i = 0; i < num_elements; i++)
-                                       ui[i] = tex->surface.u.gfx9.dcc_retile_map[i];
-                       }
-
-                       /* Copy the staging buffer to the buffer backing the texture. */
-                       struct si_context *sctx = (struct si_context*)sscreen->aux_context;
-
-                       assert(tex->surface.dcc_retile_map_offset <= UINT_MAX);
-                       simple_mtx_lock(&sscreen->aux_context_lock);
-                       si_sdma_copy_buffer(sctx, &tex->buffer.b.b, &buf->b.b,
-                                           tex->surface.dcc_retile_map_offset,
-                                           0, buf->b.b.width0);
-                       sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
-                       simple_mtx_unlock(&sscreen->aux_context_lock);
-
-                       si_resource_reference(&buf, NULL);
-               }
-       }
-
-       /* Initialize the CMASK base register value. */
-       tex->cmask_base_address_reg =
-               (tex->buffer.gpu_address + tex->surface.cmask_offset) >> 8;
-
-       if (sscreen->debug_flags & DBG(VM)) {
-               fprintf(stderr, "VM start=0x%"PRIX64"  end=0x%"PRIX64" | Texture %ix%ix%i, %i levels, %i samples, %s\n",
-                       tex->buffer.gpu_address,
-                       tex->buffer.gpu_address + tex->buffer.buf->size,
-                       base->width0, base->height0, util_num_layers(base, 0), base->last_level+1,
-                       base->nr_samples ? base->nr_samples : 1, util_format_short_name(base->format));
-       }
-
-       if (sscreen->debug_flags & DBG(TEX)) {
-               puts("Texture:");
-               struct u_log_context log;
-               u_log_context_init(&log);
-               si_print_texture_info(sscreen, tex, &log);
-               u_log_new_page_print(&log, stdout);
-               fflush(stdout);
-               u_log_context_destroy(&log);
-       }
-
-       return tex;
+   struct si_texture *tex;
+   struct si_resource *resource;
+   struct si_screen *sscreen = (struct si_screen *)screen;
+
+   tex = CALLOC_STRUCT(si_texture);
+   if (!tex)
+      goto error;
+
+   resource = &tex->buffer;
+   resource->b.b = *base;
+   resource->b.b.next = NULL;
+   resource->b.vtbl = &si_texture_vtbl;
+   pipe_reference_init(&resource->b.b.reference, 1);
+   resource->b.b.screen = screen;
+
+   /* don't include stencil-only formats which we don't support for rendering */
+   tex->is_depth = util_format_has_depth(util_format_description(tex->buffer.b.b.format));
+   tex->surface = *surface;
+   tex->tc_compatible_htile =
+      tex->surface.htile_size != 0 && (tex->surface.flags & RADEON_SURF_TC_COMPATIBLE_HTILE);
+
+   /* TC-compatible HTILE:
+    * - GFX8 only supports Z32_FLOAT.
+    * - GFX9 only supports Z32_FLOAT and Z16_UNORM. */
+   if (tex->tc_compatible_htile) {
+      if (sscreen->info.chip_class >= GFX9 && base->format == PIPE_FORMAT_Z16_UNORM)
+         tex->db_render_format = base->format;
+      else {
+         tex->db_render_format = PIPE_FORMAT_Z32_FLOAT;
+         tex->upgraded_depth = base->format != PIPE_FORMAT_Z32_FLOAT &&
+                               base->format != PIPE_FORMAT_Z32_FLOAT_S8X24_UINT;
+      }
+   } else {
+      tex->db_render_format = base->format;
+   }
+
+   /* Applies to GCN. */
+   tex->last_msaa_resolve_target_micro_mode = tex->surface.micro_tile_mode;
+
+   /* Disable separate DCC at the beginning. DRI2 doesn't reuse buffers
+    * between frames, so the only thing that can enable separate DCC
+    * with DRI2 is multiple slow clears within a frame.
+    */
+   tex->ps_draw_ratio = 0;
+
+   if (sscreen->info.chip_class >= GFX9) {
+      tex->surface.u.gfx9.surf_offset = offset;
+   } else {
+      for (unsigned i = 0; i < ARRAY_SIZE(surface->u.legacy.level); ++i)
+         tex->surface.u.legacy.level[i].offset += offset;
+   }
+
+   if (tex->is_depth) {
+      if (sscreen->info.chip_class >= GFX9) {
+         tex->can_sample_z = true;
+         tex->can_sample_s = true;
+
+         /* Stencil texturing with HTILE doesn't work
+          * with mipmapping on Navi10-14. */
+         if ((sscreen->info.family == CHIP_NAVI10 || sscreen->info.family == CHIP_NAVI12 ||
+              sscreen->info.family == CHIP_NAVI14) &&
+             base->last_level > 0)
+            tex->htile_stencil_disabled = true;
+      } else {
+         tex->can_sample_z = !tex->surface.u.legacy.depth_adjusted;
+         tex->can_sample_s = !tex->surface.u.legacy.stencil_adjusted;
+      }
+
+      tex->db_compatible = surface->flags & RADEON_SURF_ZBUFFER;
+   } else {
+      if (tex->surface.cmask_offset) {
+         tex->cb_color_info |= S_028C70_FAST_CLEAR(1);
+         tex->cmask_buffer = &tex->buffer;
+      }
+   }
+
+   if (plane0) {
+      /* The buffer is shared with the first plane. */
+      resource->bo_size = plane0->buffer.bo_size;
+      resource->bo_alignment = plane0->buffer.bo_alignment;
+      resource->flags = plane0->buffer.flags;
+      resource->domains = plane0->buffer.domains;
+      resource->vram_usage = plane0->buffer.vram_usage;
+      resource->gart_usage = plane0->buffer.gart_usage;
+
+      pb_reference(&resource->buf, plane0->buffer.buf);
+      resource->gpu_address = plane0->buffer.gpu_address;
+   } else if (!(surface->flags & RADEON_SURF_IMPORTED)) {
+      /* Create the backing buffer. */
+      si_init_resource_fields(sscreen, resource, alloc_size, alignment);
+
+      if (!si_alloc_resource(sscreen, resource))
+         goto error;
+   } else {
+      resource->buf = imported_buf;
+      resource->gpu_address = sscreen->ws->buffer_get_virtual_address(resource->buf);
+      resource->bo_size = imported_buf->size;
+      resource->bo_alignment = imported_buf->alignment;
+      resource->domains = sscreen->ws->buffer_get_initial_domain(resource->buf);
+      if (resource->domains & RADEON_DOMAIN_VRAM)
+         resource->vram_usage = resource->bo_size;
+      else if (resource->domains & RADEON_DOMAIN_GTT)
+         resource->gart_usage = resource->bo_size;
+   }
+
+   if (tex->cmask_buffer) {
+      /* Initialize the cmask to 0xCC (= compressed state). */
+      si_screen_clear_buffer(sscreen, &tex->cmask_buffer->b.b, tex->surface.cmask_offset,
+                             tex->surface.cmask_size, 0xCCCCCCCC);
+   }
+   if (tex->surface.htile_offset) {
+      uint32_t clear_value = 0;
+
+      if (sscreen->info.chip_class >= GFX9 || tex->tc_compatible_htile)
+         clear_value = 0x0000030F;
+
+      si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.htile_offset,
+                             tex->surface.htile_size, clear_value);
+   }
+
+   /* Initialize DCC only if the texture is not being imported. */
+   if (!(surface->flags & RADEON_SURF_IMPORTED) && tex->surface.dcc_offset) {
+      /* Clear DCC to black for all tiles with DCC enabled.
+       *
+       * This fixes corruption in 3DMark Slingshot Extreme, which
+       * uses uninitialized textures.
+       */
+      if (tex->surface.num_dcc_levels == tex->buffer.b.b.last_level + 1 &&
+          tex->buffer.b.b.nr_samples <= 2) {
+         /* Simple case - all tiles have DCC enabled. */
+         si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.dcc_offset,
+                                tex->surface.dcc_size, DCC_CLEAR_COLOR_0000);
+      } else if (sscreen->info.chip_class >= GFX9) {
+         /* Clear to uncompressed. Clearing this to black is complicated. */
+         si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.dcc_offset,
+                                tex->surface.dcc_size, DCC_UNCOMPRESSED);
+      } else {
+         /* GFX8: Initialize mipmap levels and multisamples separately. */
+         if (tex->buffer.b.b.nr_samples >= 2) {
+            /* Clearing this to black is complicated. */
+            si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.dcc_offset,
+                                   tex->surface.dcc_size, DCC_UNCOMPRESSED);
+         } else {
+            /* Clear the enabled mipmap levels to black. */
+            unsigned size = 0;
+
+            for (unsigned i = 0; i < tex->surface.num_dcc_levels; i++) {
+               if (!tex->surface.u.legacy.level[i].dcc_fast_clear_size)
+                  break;
+
+               size = tex->surface.u.legacy.level[i].dcc_offset +
+                      tex->surface.u.legacy.level[i].dcc_fast_clear_size;
+            }
+
+            /* Mipmap levels with DCC. */
+            if (size) {
+               si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.dcc_offset, size,
+                                      DCC_CLEAR_COLOR_0000);
+            }
+            /* Mipmap levels without DCC. */
+            if (size != tex->surface.dcc_size) {
+               si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.dcc_offset + size,
+                                      tex->surface.dcc_size - size, DCC_UNCOMPRESSED);
+            }
+         }
+      }
+
+      /* Initialize displayable DCC that requires the retile blit. */
+      if (tex->surface.dcc_retile_map_offset) {
+         /* Uninitialized DCC can hang the display hw.
+          * Clear to white to indicate that. */
+         si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.display_dcc_offset,
+                                tex->surface.u.gfx9.display_dcc_size, DCC_CLEAR_COLOR_1111);
+
+         /* Upload the DCC retile map.
+          * Use a staging buffer for the upload, because
+          * the buffer backing the texture is unmappable.
+          */
+         bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16;
+         unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements;
+         struct si_resource *buf = si_aligned_buffer_create(screen, 0, PIPE_USAGE_STREAM,
+                                                            num_elements * (use_uint16 ? 2 : 4),
+                                                            sscreen->info.tcc_cache_line_size);
+         uint32_t *ui = (uint32_t *)sscreen->ws->buffer_map(buf->buf, NULL, PIPE_TRANSFER_WRITE);
+         uint16_t *us = (uint16_t *)ui;
+
+         /* Upload the retile map into a staging buffer. */
+         if (use_uint16) {
+            for (unsigned i = 0; i < num_elements; i++)
+               us[i] = tex->surface.u.gfx9.dcc_retile_map[i];
+         } else {
+            for (unsigned i = 0; i < num_elements; i++)
+               ui[i] = tex->surface.u.gfx9.dcc_retile_map[i];
+         }
+
+         /* Copy the staging buffer to the buffer backing the texture. */
+         struct si_context *sctx = (struct si_context *)sscreen->aux_context;
+
+         assert(tex->surface.dcc_retile_map_offset <= UINT_MAX);
+         simple_mtx_lock(&sscreen->aux_context_lock);
+         si_sdma_copy_buffer(sctx, &tex->buffer.b.b, &buf->b.b, tex->surface.dcc_retile_map_offset,
+                             0, buf->b.b.width0);
+         sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
+         simple_mtx_unlock(&sscreen->aux_context_lock);
+
+         si_resource_reference(&buf, NULL);
+      }
+   }
+
+   /* Initialize the CMASK base register value. */
+   tex->cmask_base_address_reg = (tex->buffer.gpu_address + tex->surface.cmask_offset) >> 8;
+
+   if (sscreen->debug_flags & DBG(VM)) {
+      fprintf(stderr,
+              "VM start=0x%" PRIX64 "  end=0x%" PRIX64
+              " | Texture %ix%ix%i, %i levels, %i samples, %s\n",
+              tex->buffer.gpu_address, tex->buffer.gpu_address + tex->buffer.buf->size,
+              base->width0, base->height0, util_num_layers(base, 0), base->last_level + 1,
+              base->nr_samples ? base->nr_samples : 1, util_format_short_name(base->format));
+   }
+
+   if (sscreen->debug_flags & DBG(TEX)) {
+      puts("Texture:");
+      struct u_log_context log;
+      u_log_context_init(&log);
+      si_print_texture_info(sscreen, tex, &log);
+      u_log_new_page_print(&log, stdout);
+      fflush(stdout);
+      u_log_context_destroy(&log);
+   }
+
+   return tex;
 
 error:
-       FREE(tex);
-       if (sscreen->info.chip_class >= GFX9)
-               free(surface->u.gfx9.dcc_retile_map);
-       return NULL;
+   FREE(tex);
+   if (sscreen->info.chip_class >= GFX9)
+      free(surface->u.gfx9.dcc_retile_map);
+   return NULL;
 }
 
-static enum radeon_surf_mode
-si_choose_tiling(struct si_screen *sscreen,
-                const struct pipe_resource *templ, bool tc_compatible_htile)
+static enum radeon_surf_mode si_choose_tiling(struct si_screen *sscreen,
+                                              const struct pipe_resource *templ,
+                                              bool tc_compatible_htile)
 {
-       const struct util_format_description *desc = util_format_description(templ->format);
-       bool force_tiling = templ->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING;
-       bool is_depth_stencil = util_format_is_depth_or_stencil(templ->format) &&
-                               !(templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH);
-
-       /* MSAA resources must be 2D tiled. */
-       if (templ->nr_samples > 1)
-               return RADEON_SURF_MODE_2D;
-
-       /* Transfer resources should be linear. */
-       if (templ->flags & SI_RESOURCE_FLAG_TRANSFER)
-               return RADEON_SURF_MODE_LINEAR_ALIGNED;
-
-       /* Avoid Z/S decompress blits by forcing TC-compatible HTILE on GFX8,
-        * which requires 2D tiling.
-        */
-       if (sscreen->info.chip_class == GFX8 && tc_compatible_htile)
-               return RADEON_SURF_MODE_2D;
-
-       /* Handle common candidates for the linear mode.
-        * Compressed textures and DB surfaces must always be tiled.
-        */
-       if (!force_tiling &&
-           !is_depth_stencil &&
-           !util_format_is_compressed(templ->format)) {
-               if (sscreen->debug_flags & DBG(NO_TILING))
-                       return RADEON_SURF_MODE_LINEAR_ALIGNED;
-
-               /* Tiling doesn't work with the 422 (SUBSAMPLED) formats. */
-               if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED)
-                       return RADEON_SURF_MODE_LINEAR_ALIGNED;
-
-               /* Cursors are linear on AMD GCN.
-                * (XXX double-check, maybe also use RADEON_SURF_SCANOUT) */
-               if (templ->bind & PIPE_BIND_CURSOR)
-                       return RADEON_SURF_MODE_LINEAR_ALIGNED;
-
-               if (templ->bind & PIPE_BIND_LINEAR)
-                       return RADEON_SURF_MODE_LINEAR_ALIGNED;
-
-               /* Textures with a very small height are recommended to be linear. */
-               if (templ->target == PIPE_TEXTURE_1D ||
-                   templ->target == PIPE_TEXTURE_1D_ARRAY ||
-                   /* Only very thin and long 2D textures should benefit from
-                    * linear_aligned. */
-                   (templ->width0 > 8 && templ->height0 <= 2))
-                       return RADEON_SURF_MODE_LINEAR_ALIGNED;
-
-               /* Textures likely to be mapped often. */
-               if (templ->usage == PIPE_USAGE_STAGING ||
-                   templ->usage == PIPE_USAGE_STREAM)
-                       return RADEON_SURF_MODE_LINEAR_ALIGNED;
-       }
-
-       /* Make small textures 1D tiled. */
-       if (templ->width0 <= 16 || templ->height0 <= 16 ||
-           (sscreen->debug_flags & DBG(NO_2D_TILING)))
-               return RADEON_SURF_MODE_1D;
-
-       /* The allocator will switch to 1D if needed. */
-       return RADEON_SURF_MODE_2D;
+   const struct util_format_description *desc = util_format_description(templ->format);
+   bool force_tiling = templ->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING;
+   bool is_depth_stencil = util_format_is_depth_or_stencil(templ->format) &&
+                           !(templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH);
+
+   /* MSAA resources must be 2D tiled. */
+   if (templ->nr_samples > 1)
+      return RADEON_SURF_MODE_2D;
+
+   /* Transfer resources should be linear. */
+   if (templ->flags & SI_RESOURCE_FLAG_TRANSFER)
+      return RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+   /* Avoid Z/S decompress blits by forcing TC-compatible HTILE on GFX8,
+    * which requires 2D tiling.
+    */
+   if (sscreen->info.chip_class == GFX8 && tc_compatible_htile)
+      return RADEON_SURF_MODE_2D;
+
+   /* Handle common candidates for the linear mode.
+    * Compressed textures and DB surfaces must always be tiled.
+    */
+   if (!force_tiling && !is_depth_stencil && !util_format_is_compressed(templ->format)) {
+      if (sscreen->debug_flags & DBG(NO_TILING))
+         return RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+      /* Tiling doesn't work with the 422 (SUBSAMPLED) formats. */
+      if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED)
+         return RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+      /* Cursors are linear on AMD GCN.
+       * (XXX double-check, maybe also use RADEON_SURF_SCANOUT) */
+      if (templ->bind & PIPE_BIND_CURSOR)
+         return RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+      if (templ->bind & PIPE_BIND_LINEAR)
+         return RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+      /* Textures with a very small height are recommended to be linear. */
+      if (templ->target == PIPE_TEXTURE_1D || templ->target == PIPE_TEXTURE_1D_ARRAY ||
+          /* Only very thin and long 2D textures should benefit from
+           * linear_aligned. */
+          (templ->width0 > 8 && templ->height0 <= 2))
+         return RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+      /* Textures likely to be mapped often. */
+      if (templ->usage == PIPE_USAGE_STAGING || templ->usage == PIPE_USAGE_STREAM)
+         return RADEON_SURF_MODE_LINEAR_ALIGNED;
+   }
+
+   /* Make small textures 1D tiled. */
+   if (templ->width0 <= 16 || templ->height0 <= 16 || (sscreen->debug_flags & DBG(NO_2D_TILING)))
+      return RADEON_SURF_MODE_1D;
+
+   /* The allocator will switch to 1D if needed. */
+   return RADEON_SURF_MODE_2D;
 }
 
 struct pipe_resource *si_texture_create(struct pipe_screen *screen,
-                                       const struct pipe_resource *templ)
+                                        const struct pipe_resource *templ)
 {
-       struct si_screen *sscreen = (struct si_screen*)screen;
-       bool is_zs = util_format_is_depth_or_stencil(templ->format);
-
-       if (templ->nr_samples >= 2) {
-               /* This is hackish (overwriting the const pipe_resource template),
-                * but should be harmless and state trackers can also see
-                * the overridden number of samples in the created pipe_resource.
-                */
-               if (is_zs && sscreen->eqaa_force_z_samples) {
-                       ((struct pipe_resource*)templ)->nr_samples =
-                       ((struct pipe_resource*)templ)->nr_storage_samples =
-                               sscreen->eqaa_force_z_samples;
-               } else if (!is_zs && sscreen->eqaa_force_color_samples) {
-                       ((struct pipe_resource*)templ)->nr_samples =
-                               sscreen->eqaa_force_coverage_samples;
-                       ((struct pipe_resource*)templ)->nr_storage_samples =
-                               sscreen->eqaa_force_color_samples;
-               }
-       }
-
-       bool is_flushed_depth = templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH ||
-                               templ->flags & SI_RESOURCE_FLAG_TRANSFER;
-       bool tc_compatible_htile =
-               sscreen->info.chip_class >= GFX8 &&
-               /* There are issues with TC-compatible HTILE on Tonga (and
-                * Iceland is the same design), and documented bug workarounds
-                * don't help. For example, this fails:
-                *   piglit/bin/tex-miplevel-selection 'texture()' 2DShadow -auto
-                */
-               sscreen->info.family != CHIP_TONGA &&
-               sscreen->info.family != CHIP_ICELAND &&
-               (templ->flags & PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY) &&
-               !(sscreen->debug_flags & DBG(NO_HYPERZ)) &&
-               !is_flushed_depth &&
-               templ->nr_samples <= 1 && /* TC-compat HTILE is less efficient with MSAA */
-               is_zs;
-       enum radeon_surf_mode tile_mode = si_choose_tiling(sscreen, templ,
-                                                          tc_compatible_htile);
-
-       /* This allocates textures with multiple planes like NV12 in 1 buffer. */
-       enum { SI_TEXTURE_MAX_PLANES = 3 };
-       struct radeon_surf surface[SI_TEXTURE_MAX_PLANES] = {};
-       struct pipe_resource plane_templ[SI_TEXTURE_MAX_PLANES];
-       uint64_t plane_offset[SI_TEXTURE_MAX_PLANES] = {};
-       uint64_t total_size = 0;
-       unsigned max_alignment = 0;
-       unsigned num_planes = util_format_get_num_planes(templ->format);
-       assert(num_planes <= SI_TEXTURE_MAX_PLANES);
-
-       /* Compute texture or plane layouts and offsets. */
-       for (unsigned i = 0; i < num_planes; i++) {
-               plane_templ[i] = *templ;
-               plane_templ[i].format = util_format_get_plane_format(templ->format, i);
-               plane_templ[i].width0 = util_format_get_plane_width(templ->format, i, templ->width0);
-               plane_templ[i].height0 = util_format_get_plane_height(templ->format, i, templ->height0);
-
-               /* Multi-plane allocations need PIPE_BIND_SHARED, because we can't
-                * reallocate the storage to add PIPE_BIND_SHARED, since it's
-                * shared by 3 pipe_resources.
-                */
-               if (num_planes > 1)
-                       plane_templ[i].bind |= PIPE_BIND_SHARED;
-
-               if (si_init_surface(sscreen, &surface[i], &plane_templ[i],
-                                   tile_mode, 0, false,
-                                   plane_templ[i].bind & PIPE_BIND_SCANOUT,
-                                   is_flushed_depth, tc_compatible_htile))
-                       return NULL;
-
-               plane_offset[i] = align64(total_size, surface[i].surf_alignment);
-               total_size = plane_offset[i] + surface[i].total_size;
-               max_alignment = MAX2(max_alignment, surface[i].surf_alignment);
-       }
-
-       struct si_texture *plane0 = NULL, *last_plane = NULL;
-
-       for (unsigned i = 0; i < num_planes; i++) {
-               struct si_texture *tex =
-                       si_texture_create_object(screen, &plane_templ[i], &surface[i],
-                                                plane0, NULL, plane_offset[i],
-                                                total_size, max_alignment);
-               if (!tex) {
-                       si_texture_reference(&plane0, NULL);
-                       return NULL;
-               }
-
-               tex->plane_index = i;
-               tex->num_planes = num_planes;
-
-               if (!plane0) {
-                       plane0 = last_plane = tex;
-               } else {
-                       last_plane->buffer.b.b.next = &tex->buffer.b.b;
-                       last_plane = tex;
-               }
-       }
-
-       return (struct pipe_resource *)plane0;
+   struct si_screen *sscreen = (struct si_screen *)screen;
+   bool is_zs = util_format_is_depth_or_stencil(templ->format);
+
+   if (templ->nr_samples >= 2) {
+      /* This is hackish (overwriting the const pipe_resource template),
+       * but should be harmless and state trackers can also see
+       * the overridden number of samples in the created pipe_resource.
+       */
+      if (is_zs && sscreen->eqaa_force_z_samples) {
+         ((struct pipe_resource *)templ)->nr_samples =
+            ((struct pipe_resource *)templ)->nr_storage_samples = sscreen->eqaa_force_z_samples;
+      } else if (!is_zs && sscreen->eqaa_force_color_samples) {
+         ((struct pipe_resource *)templ)->nr_samples = sscreen->eqaa_force_coverage_samples;
+         ((struct pipe_resource *)templ)->nr_storage_samples = sscreen->eqaa_force_color_samples;
+      }
+   }
+
+   bool is_flushed_depth =
+      templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH || templ->flags & SI_RESOURCE_FLAG_TRANSFER;
+   bool tc_compatible_htile =
+      sscreen->info.chip_class >= GFX8 &&
+      /* There are issues with TC-compatible HTILE on Tonga (and
+       * Iceland is the same design), and documented bug workarounds
+       * don't help. For example, this fails:
+       *   piglit/bin/tex-miplevel-selection 'texture()' 2DShadow -auto
+       */
+      sscreen->info.family != CHIP_TONGA && sscreen->info.family != CHIP_ICELAND &&
+      (templ->flags & PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY) &&
+      !(sscreen->debug_flags & DBG(NO_HYPERZ)) && !is_flushed_depth &&
+      templ->nr_samples <= 1 && /* TC-compat HTILE is less efficient with MSAA */
+      is_zs;
+   enum radeon_surf_mode tile_mode = si_choose_tiling(sscreen, templ, tc_compatible_htile);
+
+   /* This allocates textures with multiple planes like NV12 in 1 buffer. */
+   enum
+   {
+      SI_TEXTURE_MAX_PLANES = 3
+   };
+   struct radeon_surf surface[SI_TEXTURE_MAX_PLANES] = {};
+   struct pipe_resource plane_templ[SI_TEXTURE_MAX_PLANES];
+   uint64_t plane_offset[SI_TEXTURE_MAX_PLANES] = {};
+   uint64_t total_size = 0;
+   unsigned max_alignment = 0;
+   unsigned num_planes = util_format_get_num_planes(templ->format);
+   assert(num_planes <= SI_TEXTURE_MAX_PLANES);
+
+   /* Compute texture or plane layouts and offsets. */
+   for (unsigned i = 0; i < num_planes; i++) {
+      plane_templ[i] = *templ;
+      plane_templ[i].format = util_format_get_plane_format(templ->format, i);
+      plane_templ[i].width0 = util_format_get_plane_width(templ->format, i, templ->width0);
+      plane_templ[i].height0 = util_format_get_plane_height(templ->format, i, templ->height0);
+
+      /* Multi-plane allocations need PIPE_BIND_SHARED, because we can't
+       * reallocate the storage to add PIPE_BIND_SHARED, since it's
+       * shared by 3 pipe_resources.
+       */
+      if (num_planes > 1)
+         plane_templ[i].bind |= PIPE_BIND_SHARED;
+
+      if (si_init_surface(sscreen, &surface[i], &plane_templ[i], tile_mode, 0, false,
+                          plane_templ[i].bind & PIPE_BIND_SCANOUT, is_flushed_depth,
+                          tc_compatible_htile))
+         return NULL;
+
+      plane_offset[i] = align64(total_size, surface[i].surf_alignment);
+      total_size = plane_offset[i] + surface[i].total_size;
+      max_alignment = MAX2(max_alignment, surface[i].surf_alignment);
+   }
+
+   struct si_texture *plane0 = NULL, *last_plane = NULL;
+
+   for (unsigned i = 0; i < num_planes; i++) {
+      struct si_texture *tex =
+         si_texture_create_object(screen, &plane_templ[i], &surface[i], plane0, NULL,
+                                  plane_offset[i], total_size, max_alignment);
+      if (!tex) {
+         si_texture_reference(&plane0, NULL);
+         return NULL;
+      }
+
+      tex->plane_index = i;
+      tex->num_planes = num_planes;
+
+      if (!plane0) {
+         plane0 = last_plane = tex;
+      } else {
+         last_plane->buffer.b.b.next = &tex->buffer.b.b;
+         last_plane = tex;
+      }
+   }
+
+   return (struct pipe_resource *)plane0;
 }
 
 static struct pipe_resource *si_texture_from_winsys_buffer(struct si_screen *sscreen,
-                                                          const struct pipe_resource *templ,
-                                                          struct pb_buffer *buf,
-                                                          unsigned stride,
-                                                          unsigned offset,
-                                                          unsigned usage,
-                                                          bool dedicated)
+                                                           const struct pipe_resource *templ,
+                                                           struct pb_buffer *buf, unsigned stride,
+                                                           unsigned offset, unsigned usage,
+                                                           bool dedicated)
 {
-       enum radeon_surf_mode array_mode;
-       struct radeon_surf surface = {};
-       struct radeon_bo_metadata metadata = {};
-       struct si_texture *tex;
-       bool is_scanout;
-       int r;
-
-       /* Ignore metadata for non-zero planes. */
-       if (offset != 0)
-               dedicated = false;
-
-       if (dedicated) {
-               sscreen->ws->buffer_get_metadata(buf, &metadata);
-               si_get_display_metadata(sscreen, &surface, &metadata,
-                                       &array_mode, &is_scanout);
-       } else {
-               /**
-                * The bo metadata is unset for un-dedicated images. So we fall
-                * back to linear. See answer to question 5 of the
-                * VK_KHX_external_memory spec for some details.
-                *
-                * It is possible that this case isn't going to work if the
-                * surface pitch isn't correctly aligned by default.
-                *
-                * In order to support it correctly we require multi-image
-                * metadata to be synchronized between radv and radeonsi. The
-                * semantics of associating multiple image metadata to a memory
-                * object on the vulkan export side are not concretely defined
-                * either.
-                *
-                * All the use cases we are aware of at the moment for memory
-                * objects use dedicated allocations. So let's keep the initial
-                * implementation simple.
-                *
-                * A possible alternative is to attempt to reconstruct the
-                * tiling information when the TexParameter TEXTURE_TILING_EXT
-                * is set.
-                */
-               array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
-               is_scanout = false;
-       }
-
-       r = si_init_surface(sscreen, &surface, templ,
-                           array_mode, stride, true, is_scanout,
-                           false, false);
-       if (r)
-               return NULL;
-
-       tex = si_texture_create_object(&sscreen->b, templ, &surface, NULL, buf,
-                                      offset, 0, 0);
-       if (!tex)
-               return NULL;
-
-       tex->buffer.b.is_shared = true;
-       tex->buffer.external_usage = usage;
-       tex->num_planes = 1;
-
-       if (!si_read_tex_bo_metadata(sscreen, tex, offset, &metadata)) {
-               si_texture_reference(&tex, NULL);
-               return NULL;
-       }
-
-       /* Displayable DCC requires an explicit flush. */
-       if (dedicated && offset == 0 &&
-           !(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) &&
-           si_has_displayable_dcc(tex)) {
-               /* TODO: do we need to decompress DCC? */
-               if (si_texture_discard_dcc(sscreen, tex)) {
-                       /* Update BO metadata after disabling DCC. */
-                       si_set_tex_bo_metadata(sscreen, tex);
-               }
-       }
-
-       assert(tex->surface.tile_swizzle == 0);
-       return &tex->buffer.b.b;
+   enum radeon_surf_mode array_mode;
+   struct radeon_surf surface = {};
+   struct radeon_bo_metadata metadata = {};
+   struct si_texture *tex;
+   bool is_scanout;
+   int r;
+
+   /* Ignore metadata for non-zero planes. */
+   if (offset != 0)
+      dedicated = false;
+
+   if (dedicated) {
+      sscreen->ws->buffer_get_metadata(buf, &metadata);
+      si_get_display_metadata(sscreen, &surface, &metadata, &array_mode, &is_scanout);
+   } else {
+      /**
+       * The bo metadata is unset for un-dedicated images. So we fall
+       * back to linear. See answer to question 5 of the
+       * VK_KHX_external_memory spec for some details.
+       *
+       * It is possible that this case isn't going to work if the
+       * surface pitch isn't correctly aligned by default.
+       *
+       * In order to support it correctly we require multi-image
+       * metadata to be synchronized between radv and radeonsi. The
+       * semantics of associating multiple image metadata to a memory
+       * object on the vulkan export side are not concretely defined
+       * either.
+       *
+       * All the use cases we are aware of at the moment for memory
+       * objects use dedicated allocations. So let's keep the initial
+       * implementation simple.
+       *
+       * A possible alternative is to attempt to reconstruct the
+       * tiling information when the TexParameter TEXTURE_TILING_EXT
+       * is set.
+       */
+      array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
+      is_scanout = false;
+   }
+
+   r =
+      si_init_surface(sscreen, &surface, templ, array_mode, stride, true, is_scanout, false, false);
+   if (r)
+      return NULL;
+
+   tex = si_texture_create_object(&sscreen->b, templ, &surface, NULL, buf, offset, 0, 0);
+   if (!tex)
+      return NULL;
+
+   tex->buffer.b.is_shared = true;
+   tex->buffer.external_usage = usage;
+   tex->num_planes = 1;
+
+   if (!si_read_tex_bo_metadata(sscreen, tex, offset, &metadata)) {
+      si_texture_reference(&tex, NULL);
+      return NULL;
+   }
+
+   /* Displayable DCC requires an explicit flush. */
+   if (dedicated && offset == 0 && !(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) &&
+       si_has_displayable_dcc(tex)) {
+      /* TODO: do we need to decompress DCC? */
+      if (si_texture_discard_dcc(sscreen, tex)) {
+         /* Update BO metadata after disabling DCC. */
+         si_set_tex_bo_metadata(sscreen, tex);
+      }
+   }
+
+   assert(tex->surface.tile_swizzle == 0);
+   return &tex->buffer.b.b;
 }
 
 static struct pipe_resource *si_texture_from_handle(struct pipe_screen *screen,
-                                                   const struct pipe_resource *templ,
-                                                   struct winsys_handle *whandle,
-                                                   unsigned usage)
+                                                    const struct pipe_resource *templ,
+                                                    struct winsys_handle *whandle, unsigned usage)
 {
-       struct si_screen *sscreen = (struct si_screen*)screen;
-       struct pb_buffer *buf = NULL;
-
-       /* Support only 2D textures without mipmaps */
-       if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT &&
-            templ->target != PIPE_TEXTURE_2D_ARRAY) ||
-             templ->last_level != 0)
-               return NULL;
-
-       buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle,
-                                             sscreen->info.max_alignment);
-       if (!buf)
-               return NULL;
-
-       return si_texture_from_winsys_buffer(sscreen, templ, buf,
-                                            whandle->stride, whandle->offset,
-                                            usage, true);
+   struct si_screen *sscreen = (struct si_screen *)screen;
+   struct pb_buffer *buf = NULL;
+
+   /* Support only 2D textures without mipmaps */
+   if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT &&
+        templ->target != PIPE_TEXTURE_2D_ARRAY) ||
+       templ->last_level != 0)
+      return NULL;
+
+   buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle, sscreen->info.max_alignment);
+   if (!buf)
+      return NULL;
+
+   return si_texture_from_winsys_buffer(sscreen, templ, buf, whandle->stride, whandle->offset,
+                                        usage, true);
 }
 
-bool si_init_flushed_depth_texture(struct pipe_context *ctx,
-                                  struct pipe_resource *texture)
+bool si_init_flushed_depth_texture(struct pipe_context *ctx, struct pipe_resource *texture)
 {
-       struct si_texture *tex = (struct si_texture*)texture;
-       struct pipe_resource resource;
-       enum pipe_format pipe_format = texture->format;
-
-       assert(!tex->flushed_depth_texture);
-
-       if (!tex->can_sample_z && tex->can_sample_s) {
-               switch (pipe_format) {
-               case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
-                       /* Save memory by not allocating the S plane. */
-                       pipe_format = PIPE_FORMAT_Z32_FLOAT;
-                       break;
-               case PIPE_FORMAT_Z24_UNORM_S8_UINT:
-               case PIPE_FORMAT_S8_UINT_Z24_UNORM:
-                       /* Save memory bandwidth by not copying the
-                        * stencil part during flush.
-                        *
-                        * This potentially increases memory bandwidth
-                        * if an application uses both Z and S texturing
-                        * simultaneously (a flushed Z24S8 texture
-                        * would be stored compactly), but how often
-                        * does that really happen?
-                        */
-                       pipe_format = PIPE_FORMAT_Z24X8_UNORM;
-                       break;
-               default:;
-               }
-       } else if (!tex->can_sample_s && tex->can_sample_z) {
-               assert(util_format_has_stencil(util_format_description(pipe_format)));
-
-               /* DB->CB copies to an 8bpp surface don't work. */
-               pipe_format = PIPE_FORMAT_X24S8_UINT;
-       }
-
-       memset(&resource, 0, sizeof(resource));
-       resource.target = texture->target;
-       resource.format = pipe_format;
-       resource.width0 = texture->width0;
-       resource.height0 = texture->height0;
-       resource.depth0 = texture->depth0;
-       resource.array_size = texture->array_size;
-       resource.last_level = texture->last_level;
-       resource.nr_samples = texture->nr_samples;
-       resource.usage = PIPE_USAGE_DEFAULT;
-       resource.bind = texture->bind & ~PIPE_BIND_DEPTH_STENCIL;
-       resource.flags = texture->flags | SI_RESOURCE_FLAG_FLUSHED_DEPTH;
-
-       tex->flushed_depth_texture = (struct si_texture *)ctx->screen->resource_create(ctx->screen, &resource);
-       if (!tex->flushed_depth_texture) {
-               PRINT_ERR("failed to create temporary texture to hold flushed depth\n");
-               return false;
-       }
-       return true;
+   struct si_texture *tex = (struct si_texture *)texture;
+   struct pipe_resource resource;
+   enum pipe_format pipe_format = texture->format;
+
+   assert(!tex->flushed_depth_texture);
+
+   if (!tex->can_sample_z && tex->can_sample_s) {
+      switch (pipe_format) {
+      case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+         /* Save memory by not allocating the S plane. */
+         pipe_format = PIPE_FORMAT_Z32_FLOAT;
+         break;
+      case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+      case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+         /* Save memory bandwidth by not copying the
+          * stencil part during flush.
+          *
+          * This potentially increases memory bandwidth
+          * if an application uses both Z and S texturing
+          * simultaneously (a flushed Z24S8 texture
+          * would be stored compactly), but how often
+          * does that really happen?
+          */
+         pipe_format = PIPE_FORMAT_Z24X8_UNORM;
+         break;
+      default:;
+      }
+   } else if (!tex->can_sample_s && tex->can_sample_z) {
+      assert(util_format_has_stencil(util_format_description(pipe_format)));
+
+      /* DB->CB copies to an 8bpp surface don't work. */
+      pipe_format = PIPE_FORMAT_X24S8_UINT;
+   }
+
+   memset(&resource, 0, sizeof(resource));
+   resource.target = texture->target;
+   resource.format = pipe_format;
+   resource.width0 = texture->width0;
+   resource.height0 = texture->height0;
+   resource.depth0 = texture->depth0;
+   resource.array_size = texture->array_size;
+   resource.last_level = texture->last_level;
+   resource.nr_samples = texture->nr_samples;
+   resource.usage = PIPE_USAGE_DEFAULT;
+   resource.bind = texture->bind & ~PIPE_BIND_DEPTH_STENCIL;
+   resource.flags = texture->flags | SI_RESOURCE_FLAG_FLUSHED_DEPTH;
+
+   tex->flushed_depth_texture =
+      (struct si_texture *)ctx->screen->resource_create(ctx->screen, &resource);
+   if (!tex->flushed_depth_texture) {
+      PRINT_ERR("failed to create temporary texture to hold flushed depth\n");
+      return false;
+   }
+   return true;
 }
 
 /**
@@ -1905,836 +1763,759 @@ bool si_init_flushed_depth_texture(struct pipe_context *ctx,
  * which is supposed to hold a subregion of the texture "orig" at the given
  * mipmap level.
  */
-static void si_init_temp_resource_from_box(struct pipe_resource *res,
-                                          struct pipe_resource *orig,
-                                          const struct pipe_box *box,
-                                          unsigned level, unsigned flags)
+static void si_init_temp_resource_from_box(struct pipe_resource *res, struct pipe_resource *orig,
+                                           const struct pipe_box *box, unsigned level,
+                                           unsigned flags)
 {
-       memset(res, 0, sizeof(*res));
-       res->format = orig->format;
-       res->width0 = box->width;
-       res->height0 = box->height;
-       res->depth0 = 1;
-       res->array_size = 1;
-       res->usage = flags & SI_RESOURCE_FLAG_TRANSFER ? PIPE_USAGE_STAGING : PIPE_USAGE_DEFAULT;
-       res->flags = flags;
-
-       if (flags & SI_RESOURCE_FLAG_TRANSFER &&
-           util_format_is_compressed(orig->format)) {
-               /* Transfer resources are allocated with linear tiling, which is
-                * not supported for compressed formats.
-                */
-               unsigned blocksize =
-                       util_format_get_blocksize(orig->format);
-
-               if (blocksize == 8) {
-                       res->format = PIPE_FORMAT_R16G16B16A16_UINT;
-               } else {
-                       assert(blocksize == 16);
-                       res->format = PIPE_FORMAT_R32G32B32A32_UINT;
-               }
-
-               res->width0 = util_format_get_nblocksx(orig->format, box->width);
-               res->height0 = util_format_get_nblocksy(orig->format, box->height);
-       }
-
-       /* We must set the correct texture target and dimensions for a 3D box. */
-       if (box->depth > 1 && util_max_layer(orig, level) > 0) {
-               res->target = PIPE_TEXTURE_2D_ARRAY;
-               res->array_size = box->depth;
-       } else {
-               res->target = PIPE_TEXTURE_2D;
-       }
+   memset(res, 0, sizeof(*res));
+   res->format = orig->format;
+   res->width0 = box->width;
+   res->height0 = box->height;
+   res->depth0 = 1;
+   res->array_size = 1;
+   res->usage = flags & SI_RESOURCE_FLAG_TRANSFER ? PIPE_USAGE_STAGING : PIPE_USAGE_DEFAULT;
+   res->flags = flags;
+
+   if (flags & SI_RESOURCE_FLAG_TRANSFER && util_format_is_compressed(orig->format)) {
+      /* Transfer resources are allocated with linear tiling, which is
+       * not supported for compressed formats.
+       */
+      unsigned blocksize = util_format_get_blocksize(orig->format);
+
+      if (blocksize == 8) {
+         res->format = PIPE_FORMAT_R16G16B16A16_UINT;
+      } else {
+         assert(blocksize == 16);
+         res->format = PIPE_FORMAT_R32G32B32A32_UINT;
+      }
+
+      res->width0 = util_format_get_nblocksx(orig->format, box->width);
+      res->height0 = util_format_get_nblocksy(orig->format, box->height);
+   }
+
+   /* We must set the correct texture target and dimensions for a 3D box. */
+   if (box->depth > 1 && util_max_layer(orig, level) > 0) {
+      res->target = PIPE_TEXTURE_2D_ARRAY;
+      res->array_size = box->depth;
+   } else {
+      res->target = PIPE_TEXTURE_2D;
+   }
 }
 
-static bool si_can_invalidate_texture(struct si_screen *sscreen,
-                                     struct si_texture *tex,
-                                     unsigned transfer_usage,
-                                     const struct pipe_box *box)
+static bool si_can_invalidate_texture(struct si_screen *sscreen, struct si_texture *tex,
+                                      unsigned transfer_usage, const struct pipe_box *box)
 {
-       return !tex->buffer.b.is_shared &&
-               !(tex->surface.flags & RADEON_SURF_IMPORTED) &&
-               !(transfer_usage & PIPE_TRANSFER_READ) &&
-               tex->buffer.b.b.last_level == 0 &&
-               util_texrange_covers_whole_level(&tex->buffer.b.b, 0,
-                                                box->x, box->y, box->z,
-                                                box->width, box->height,
-                                                box->depth);
+   return !tex->buffer.b.is_shared && !(tex->surface.flags & RADEON_SURF_IMPORTED) &&
+          !(transfer_usage & PIPE_TRANSFER_READ) && tex->buffer.b.b.last_level == 0 &&
+          util_texrange_covers_whole_level(&tex->buffer.b.b, 0, box->x, box->y, box->z, box->width,
+                                           box->height, box->depth);
 }
 
-static void si_texture_invalidate_storage(struct si_context *sctx,
-                                         struct si_texture *tex)
+static void si_texture_invalidate_storage(struct si_context *sctx, struct si_texture *tex)
 {
-       struct si_screen *sscreen = sctx->screen;
+   struct si_screen *sscreen = sctx->screen;
 
-       /* There is no point in discarding depth and tiled buffers. */
-       assert(!tex->is_depth);
-       assert(tex->surface.is_linear);
+   /* There is no point in discarding depth and tiled buffers. */
+   assert(!tex->is_depth);
+   assert(tex->surface.is_linear);
 
-       /* Reallocate the buffer in the same pipe_resource. */
-       si_alloc_resource(sscreen, &tex->buffer);
+   /* Reallocate the buffer in the same pipe_resource. */
+   si_alloc_resource(sscreen, &tex->buffer);
 
-       /* Initialize the CMASK base address (needed even without CMASK). */
-       tex->cmask_base_address_reg =
-               (tex->buffer.gpu_address + tex->surface.cmask_offset) >> 8;
+   /* Initialize the CMASK base address (needed even without CMASK). */
+   tex->cmask_base_address_reg = (tex->buffer.gpu_address + tex->surface.cmask_offset) >> 8;
 
-       p_atomic_inc(&sscreen->dirty_tex_counter);
+   p_atomic_inc(&sscreen->dirty_tex_counter);
 
-       sctx->num_alloc_tex_transfer_bytes += tex->surface.total_size;
+   sctx->num_alloc_tex_transfer_bytes += tex->surface.total_size;
 }
 
-static void *si_texture_transfer_map(struct pipe_context *ctx,
-                                    struct pipe_resource *texture,
-                                    unsigned level,
-                                    unsigned usage,
-                                    const struct pipe_box *box,
-                                    struct pipe_transfer **ptransfer)
+static void *si_texture_transfer_map(struct pipe_context *ctx, struct pipe_resource *texture,
+                                     unsigned level, unsigned usage, const struct pipe_box *box,
+                                     struct pipe_transfer **ptransfer)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
-       struct si_texture *tex = (struct si_texture*)texture;
-       struct si_transfer *trans;
-       struct si_resource *buf;
-       unsigned offset = 0;
-       char *map;
-       bool use_staging_texture = false;
-
-       assert(!(texture->flags & SI_RESOURCE_FLAG_TRANSFER));
-       assert(box->width && box->height && box->depth);
-
-       if (tex->is_depth) {
-               /* Depth textures use staging unconditionally. */
-               use_staging_texture = true;
-       } else {
-               /* Degrade the tile mode if we get too many transfers on APUs.
-                * On dGPUs, the staging texture is always faster.
-                * Only count uploads that are at least 4x4 pixels large.
-                */
-               if (!sctx->screen->info.has_dedicated_vram &&
-                   level == 0 &&
-                   box->width >= 4 && box->height >= 4 &&
-                   p_atomic_inc_return(&tex->num_level0_transfers) == 10) {
-                       bool can_invalidate =
-                               si_can_invalidate_texture(sctx->screen, tex,
-                                                           usage, box);
-
-                       si_reallocate_texture_inplace(sctx, tex,
-                                                       PIPE_BIND_LINEAR,
-                                                       can_invalidate);
-               }
-
-               /* Tiled textures need to be converted into a linear texture for CPU
-                * access. The staging texture is always linear and is placed in GART.
-                *
-                * Reading from VRAM or GTT WC is slow, always use the staging
-                * texture in this case.
-                *
-                * Use the staging texture for uploads if the underlying BO
-                * is busy.
-                */
-               if (!tex->surface.is_linear)
-                       use_staging_texture = true;
-               else if (usage & PIPE_TRANSFER_READ)
-                       use_staging_texture =
-                               tex->buffer.domains & RADEON_DOMAIN_VRAM ||
-                               tex->buffer.flags & RADEON_FLAG_GTT_WC;
-               /* Write & linear only: */
-               else if (si_rings_is_buffer_referenced(sctx, tex->buffer.buf,
-                                                      RADEON_USAGE_READWRITE) ||
-                        !sctx->ws->buffer_wait(tex->buffer.buf, 0,
-                                               RADEON_USAGE_READWRITE)) {
-                       /* It's busy. */
-                       if (si_can_invalidate_texture(sctx->screen, tex,
-                                                       usage, box))
-                               si_texture_invalidate_storage(sctx, tex);
-                       else
-                               use_staging_texture = true;
-               }
-       }
-
-       trans = CALLOC_STRUCT(si_transfer);
-       if (!trans)
-               return NULL;
-       pipe_resource_reference(&trans->b.b.resource, texture);
-       trans->b.b.level = level;
-       trans->b.b.usage = usage;
-       trans->b.b.box = *box;
-
-       if (use_staging_texture) {
-               struct pipe_resource resource;
-               struct si_texture *staging;
-
-               si_init_temp_resource_from_box(&resource, texture, box, level,
-                                                SI_RESOURCE_FLAG_TRANSFER);
-               resource.usage = (usage & PIPE_TRANSFER_READ) ?
-                       PIPE_USAGE_STAGING : PIPE_USAGE_STREAM;
-
-               /* Since depth-stencil textures don't support linear tiling,
-                * blit from ZS to color and vice versa. u_blitter will do
-                * the packing for these formats.
-                */
-               if (tex->is_depth)
-                       resource.format = util_blitter_get_color_format_for_zs(resource.format);
-
-               /* Create the temporary texture. */
-               staging = (struct si_texture*)ctx->screen->resource_create(ctx->screen, &resource);
-               if (!staging) {
-                       PRINT_ERR("failed to create temporary texture to hold untiled copy\n");
-                       goto fail_trans;
-               }
-               trans->staging = &staging->buffer;
-
-               /* Just get the strides. */
-               si_texture_get_offset(sctx->screen, staging, 0, NULL,
-                                       &trans->b.b.stride,
-                                       &trans->b.b.layer_stride);
-
-               if (usage & PIPE_TRANSFER_READ)
-                       si_copy_to_staging_texture(ctx, trans);
-               else
-                       usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
-
-               buf = trans->staging;
-       } else {
-               /* the resource is mapped directly */
-               offset = si_texture_get_offset(sctx->screen, tex, level, box,
-                                                &trans->b.b.stride,
-                                                &trans->b.b.layer_stride);
-               buf = &tex->buffer;
-       }
-
-       /* Always unmap texture CPU mappings on 32-bit architectures, so that
-        * we don't run out of the CPU address space.
-        */
-       if (sizeof(void*) == 4)
-               usage |= RADEON_TRANSFER_TEMPORARY;
-
-       if (!(map = si_buffer_map_sync_with_rings(sctx, buf, usage)))
-               goto fail_trans;
-
-       *ptransfer = &trans->b.b;
-       return map + offset;
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_texture *tex = (struct si_texture *)texture;
+   struct si_transfer *trans;
+   struct si_resource *buf;
+   unsigned offset = 0;
+   char *map;
+   bool use_staging_texture = false;
+
+   assert(!(texture->flags & SI_RESOURCE_FLAG_TRANSFER));
+   assert(box->width && box->height && box->depth);
+
+   if (tex->is_depth) {
+      /* Depth textures use staging unconditionally. */
+      use_staging_texture = true;
+   } else {
+      /* Degrade the tile mode if we get too many transfers on APUs.
+       * On dGPUs, the staging texture is always faster.
+       * Only count uploads that are at least 4x4 pixels large.
+       */
+      if (!sctx->screen->info.has_dedicated_vram && level == 0 && box->width >= 4 &&
+          box->height >= 4 && p_atomic_inc_return(&tex->num_level0_transfers) == 10) {
+         bool can_invalidate = si_can_invalidate_texture(sctx->screen, tex, usage, box);
+
+         si_reallocate_texture_inplace(sctx, tex, PIPE_BIND_LINEAR, can_invalidate);
+      }
+
+      /* Tiled textures need to be converted into a linear texture for CPU
+       * access. The staging texture is always linear and is placed in GART.
+       *
+       * Reading from VRAM or GTT WC is slow, so always use the staging
+       * texture in this case.
+       *
+       * Use the staging texture for uploads if the underlying BO
+       * is busy.
+       */
+      if (!tex->surface.is_linear)
+         use_staging_texture = true;
+      else if (usage & PIPE_TRANSFER_READ)
+         use_staging_texture =
+            tex->buffer.domains & RADEON_DOMAIN_VRAM || tex->buffer.flags & RADEON_FLAG_GTT_WC;
+      /* Write & linear only: */
+      else if (si_rings_is_buffer_referenced(sctx, tex->buffer.buf, RADEON_USAGE_READWRITE) ||
+               !sctx->ws->buffer_wait(tex->buffer.buf, 0, RADEON_USAGE_READWRITE)) {
+         /* It's busy. */
+         if (si_can_invalidate_texture(sctx->screen, tex, usage, box))
+            si_texture_invalidate_storage(sctx, tex);
+         else
+            use_staging_texture = true;
+      }
+   }
+
+   trans = CALLOC_STRUCT(si_transfer);
+   if (!trans)
+      return NULL;
+   pipe_resource_reference(&trans->b.b.resource, texture);
+   trans->b.b.level = level;
+   trans->b.b.usage = usage;
+   trans->b.b.box = *box;
+
+   if (use_staging_texture) {
+      struct pipe_resource resource;
+      struct si_texture *staging;
+
+      si_init_temp_resource_from_box(&resource, texture, box, level, SI_RESOURCE_FLAG_TRANSFER);
+      resource.usage = (usage & PIPE_TRANSFER_READ) ? PIPE_USAGE_STAGING : PIPE_USAGE_STREAM;
+
+      /* Since depth-stencil textures don't support linear tiling,
+       * blit from ZS to color and vice versa. u_blitter will do
+       * the packing for these formats.
+       */
+      if (tex->is_depth)
+         resource.format = util_blitter_get_color_format_for_zs(resource.format);
+
+      /* Create the temporary texture. */
+      staging = (struct si_texture *)ctx->screen->resource_create(ctx->screen, &resource);
+      if (!staging) {
+         PRINT_ERR("failed to create temporary texture to hold untiled copy\n");
+         goto fail_trans;
+      }
+      trans->staging = &staging->buffer;
+
+      /* Just get the strides. */
+      si_texture_get_offset(sctx->screen, staging, 0, NULL, &trans->b.b.stride,
+                            &trans->b.b.layer_stride);
+
+      if (usage & PIPE_TRANSFER_READ)
+         si_copy_to_staging_texture(ctx, trans);
+      else
+         usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+
+      buf = trans->staging;
+   } else {
+      /* the resource is mapped directly */
+      offset = si_texture_get_offset(sctx->screen, tex, level, box, &trans->b.b.stride,
+                                     &trans->b.b.layer_stride);
+      buf = &tex->buffer;
+   }
+
+   /* Always unmap texture CPU mappings on 32-bit architectures, so that
+    * we don't run out of the CPU address space.
+    */
+   if (sizeof(void *) == 4)
+      usage |= RADEON_TRANSFER_TEMPORARY;
+
+   if (!(map = si_buffer_map_sync_with_rings(sctx, buf, usage)))
+      goto fail_trans;
+
+   *ptransfer = &trans->b.b;
+   return map + offset;
 
 fail_trans:
-       si_resource_reference(&trans->staging, NULL);
-       pipe_resource_reference(&trans->b.b.resource, NULL);
-       FREE(trans);
-       return NULL;
+   si_resource_reference(&trans->staging, NULL);
+   pipe_resource_reference(&trans->b.b.resource, NULL);
+   FREE(trans);
+   return NULL;
 }
 
-static void si_texture_transfer_unmap(struct pipe_context *ctx,
-                                     struct pipe_transfer* transfer)
+static void si_texture_transfer_unmap(struct pipe_context *ctx, struct pipe_transfer *transfer)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
-       struct si_transfer *stransfer = (struct si_transfer*)transfer;
-       struct pipe_resource *texture = transfer->resource;
-       struct si_texture *tex = (struct si_texture*)texture;
-
-       /* Always unmap texture CPU mappings on 32-bit architectures, so that
-        * we don't run out of the CPU address space.
-        */
-       if (sizeof(void*) == 4) {
-               struct si_resource *buf =
-                       stransfer->staging ? stransfer->staging : &tex->buffer;
-
-               sctx->ws->buffer_unmap(buf->buf);
-       }
-
-       if ((transfer->usage & PIPE_TRANSFER_WRITE) && stransfer->staging)
-               si_copy_from_staging_texture(ctx, stransfer);
-
-       if (stransfer->staging) {
-               sctx->num_alloc_tex_transfer_bytes += stransfer->staging->buf->size;
-               si_resource_reference(&stransfer->staging, NULL);
-       }
-
-       /* Heuristic for {upload, draw, upload, draw, ..}:
-        *
-        * Flush the gfx IB if we've allocated too much texture storage.
-        *
-        * The idea is that we don't want to build IBs that use too much
-        * memory and put pressure on the kernel memory manager and we also
-        * want to make temporary and invalidated buffers go idle ASAP to
-        * decrease the total memory usage or make them reusable. The memory
-        * usage will be slightly higher than given here because of the buffer
-        * cache in the winsys.
-        *
-        * The result is that the kernel memory manager is never a bottleneck.
-        */
-       if (sctx->num_alloc_tex_transfer_bytes > sctx->screen->info.gart_size / 4) {
-               si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
-               sctx->num_alloc_tex_transfer_bytes = 0;
-       }
-
-       pipe_resource_reference(&transfer->resource, NULL);
-       FREE(transfer);
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_transfer *stransfer = (struct si_transfer *)transfer;
+   struct pipe_resource *texture = transfer->resource;
+   struct si_texture *tex = (struct si_texture *)texture;
+
+   /* Always unmap texture CPU mappings on 32-bit architectures, so that
+    * we don't run out of the CPU address space.
+    */
+   if (sizeof(void *) == 4) {
+      struct si_resource *buf = stransfer->staging ? stransfer->staging : &tex->buffer;
+
+      sctx->ws->buffer_unmap(buf->buf);
+   }
+
+   if ((transfer->usage & PIPE_TRANSFER_WRITE) && stransfer->staging)
+      si_copy_from_staging_texture(ctx, stransfer);
+
+   if (stransfer->staging) {
+      sctx->num_alloc_tex_transfer_bytes += stransfer->staging->buf->size;
+      si_resource_reference(&stransfer->staging, NULL);
+   }
+
+   /* Heuristic for {upload, draw, upload, draw, ..}:
+    *
+    * Flush the gfx IB if we've allocated too much texture storage.
+    *
+    * The idea is that we don't want to build IBs that use too much
+    * memory and put pressure on the kernel memory manager, and we also
+    * want to make temporary and invalidated buffers go idle ASAP to
+    * decrease the total memory usage or make them reusable. The memory
+    * usage will be slightly higher than given here because of the buffer
+    * cache in the winsys.
+    *
+    * The result is that the kernel memory manager is never a bottleneck.
+    */
+   if (sctx->num_alloc_tex_transfer_bytes > sctx->screen->info.gart_size / 4) {
+      si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+      sctx->num_alloc_tex_transfer_bytes = 0;
+   }
+
+   pipe_resource_reference(&transfer->resource, NULL);
+   FREE(transfer);
 }
 
-static const struct u_resource_vtbl si_texture_vtbl =
-{
-       NULL,                           /* get_handle */
-       si_texture_destroy,             /* resource_destroy */
-       si_texture_transfer_map,        /* transfer_map */
-       u_default_transfer_flush_region, /* transfer_flush_region */
-       si_texture_transfer_unmap,      /* transfer_unmap */
+static const struct u_resource_vtbl si_texture_vtbl = {
+   NULL,                            /* get_handle */
+   si_texture_destroy,              /* resource_destroy */
+   si_texture_transfer_map,         /* transfer_map */
+   u_default_transfer_flush_region, /* transfer_flush_region */
+   si_texture_transfer_unmap,       /* transfer_unmap */
 };
 
 /* Return if it's allowed to reinterpret one format as another with DCC enabled.
  */
-bool vi_dcc_formats_compatible(struct si_screen *sscreen,
-                              enum pipe_format format1,
-                              enum pipe_format format2)
+bool vi_dcc_formats_compatible(struct si_screen *sscreen, enum pipe_format format1,
+                               enum pipe_format format2)
 {
-       const struct util_format_description *desc1, *desc2;
-
-       /* No format change - exit early. */
-       if (format1 == format2)
-               return true;
-
-       format1 = si_simplify_cb_format(format1);
-       format2 = si_simplify_cb_format(format2);
-
-       /* Check again after format adjustments. */
-       if (format1 == format2)
-               return true;
-
-       desc1 = util_format_description(format1);
-       desc2 = util_format_description(format2);
-
-       if (desc1->layout != UTIL_FORMAT_LAYOUT_PLAIN ||
-           desc2->layout != UTIL_FORMAT_LAYOUT_PLAIN)
-               return false;
-
-       /* Float and non-float are totally incompatible. */
-       if ((desc1->channel[0].type == UTIL_FORMAT_TYPE_FLOAT) !=
-           (desc2->channel[0].type == UTIL_FORMAT_TYPE_FLOAT))
-               return false;
-
-       /* Channel sizes must match across DCC formats.
-        * Comparing just the first 2 channels should be enough.
-        */
-       if (desc1->channel[0].size != desc2->channel[0].size ||
-           (desc1->nr_channels >= 2 &&
-            desc1->channel[1].size != desc2->channel[1].size))
-               return false;
-
-       /* Everything below is not needed if the driver never uses the DCC
-        * clear code with the value of 1.
-        */
-
-       /* If the clear values are all 1 or all 0, this constraint can be
-        * ignored. */
-       if (vi_alpha_is_on_msb(sscreen, format1) != vi_alpha_is_on_msb(sscreen, format2))
-               return false;
-
-       /* Channel types must match if the clear value of 1 is used.
-        * The type categories are only float, signed, unsigned.
-        * NORM and INT are always compatible.
-        */
-       if (desc1->channel[0].type != desc2->channel[0].type ||
-           (desc1->nr_channels >= 2 &&
-            desc1->channel[1].type != desc2->channel[1].type))
-               return false;
-
-       return true;
+   const struct util_format_description *desc1, *desc2;
+
+   /* No format change - exit early. */
+   if (format1 == format2)
+      return true;
+
+   format1 = si_simplify_cb_format(format1);
+   format2 = si_simplify_cb_format(format2);
+
+   /* Check again after format adjustments. */
+   if (format1 == format2)
+      return true;
+
+   desc1 = util_format_description(format1);
+   desc2 = util_format_description(format2);
+
+   if (desc1->layout != UTIL_FORMAT_LAYOUT_PLAIN || desc2->layout != UTIL_FORMAT_LAYOUT_PLAIN)
+      return false;
+
+   /* Float and non-float are totally incompatible. */
+   if ((desc1->channel[0].type == UTIL_FORMAT_TYPE_FLOAT) !=
+       (desc2->channel[0].type == UTIL_FORMAT_TYPE_FLOAT))
+      return false;
+
+   /* Channel sizes must match across DCC formats.
+    * Comparing just the first 2 channels should be enough.
+    */
+   if (desc1->channel[0].size != desc2->channel[0].size ||
+       (desc1->nr_channels >= 2 && desc1->channel[1].size != desc2->channel[1].size))
+      return false;
+
+   /* Everything below is not needed if the driver never uses the DCC
+    * clear code with the value of 1.
+    */
+
+   /* If the clear values are all 1 or all 0, this constraint can be
+    * ignored. */
+   if (vi_alpha_is_on_msb(sscreen, format1) != vi_alpha_is_on_msb(sscreen, format2))
+      return false;
+
+   /* Channel types must match if the clear value of 1 is used.
+    * The type categories are only float, signed, unsigned.
+    * NORM and INT are always compatible.
+    */
+   if (desc1->channel[0].type != desc2->channel[0].type ||
+       (desc1->nr_channels >= 2 && desc1->channel[1].type != desc2->channel[1].type))
+      return false;
+
+   return true;
 }
 
-bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex,
-                                    unsigned level,
-                                    enum pipe_format view_format)
+bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex, unsigned level,
+                                     enum pipe_format view_format)
 {
-       struct si_texture *stex = (struct si_texture *)tex;
+   struct si_texture *stex = (struct si_texture *)tex;
 
-       return vi_dcc_enabled(stex, level) &&
-              !vi_dcc_formats_compatible((struct si_screen*)tex->screen,
-                                         tex->format, view_format);
+   return vi_dcc_enabled(stex, level) &&
+          !vi_dcc_formats_compatible((struct si_screen *)tex->screen, tex->format, view_format);
 }
 
 /* This can't be merged with the above function, because
  * vi_dcc_formats_compatible should be called only when DCC is enabled. */
-void vi_disable_dcc_if_incompatible_format(struct si_context *sctx,
-                                          struct pipe_resource *tex,
-                                          unsigned level,
-                                          enum pipe_format view_format)
+void vi_disable_dcc_if_incompatible_format(struct si_context *sctx, struct pipe_resource *tex,
+                                           unsigned level, enum pipe_format view_format)
 {
-       struct si_texture *stex = (struct si_texture *)tex;
+   struct si_texture *stex = (struct si_texture *)tex;
 
-       if (vi_dcc_formats_are_incompatible(tex, level, view_format))
-               if (!si_texture_disable_dcc(sctx, stex))
-                       si_decompress_dcc(sctx, stex);
+   if (vi_dcc_formats_are_incompatible(tex, level, view_format))
+      if (!si_texture_disable_dcc(sctx, stex))
+         si_decompress_dcc(sctx, stex);
 }
 
 struct pipe_surface *si_create_surface_custom(struct pipe_context *pipe,
-                                             struct pipe_resource *texture,
-                                             const struct pipe_surface *templ,
-                                             unsigned width0, unsigned height0,
-                                             unsigned width, unsigned height)
+                                              struct pipe_resource *texture,
+                                              const struct pipe_surface *templ, unsigned width0,
+                                              unsigned height0, unsigned width, unsigned height)
 {
-       struct si_surface *surface = CALLOC_STRUCT(si_surface);
-
-       if (!surface)
-               return NULL;
-
-       assert(templ->u.tex.first_layer <= util_max_layer(texture, templ->u.tex.level));
-       assert(templ->u.tex.last_layer <= util_max_layer(texture, templ->u.tex.level));
-
-       pipe_reference_init(&surface->base.reference, 1);
-       pipe_resource_reference(&surface->base.texture, texture);
-       surface->base.context = pipe;
-       surface->base.format = templ->format;
-       surface->base.width = width;
-       surface->base.height = height;
-       surface->base.u = templ->u;
-
-       surface->width0 = width0;
-       surface->height0 = height0;
-
-       surface->dcc_incompatible =
-               texture->target != PIPE_BUFFER &&
-               vi_dcc_formats_are_incompatible(texture, templ->u.tex.level,
-                                               templ->format);
-       return &surface->base;
+   struct si_surface *surface = CALLOC_STRUCT(si_surface);
+
+   if (!surface)
+      return NULL;
+
+   assert(templ->u.tex.first_layer <= util_max_layer(texture, templ->u.tex.level));
+   assert(templ->u.tex.last_layer <= util_max_layer(texture, templ->u.tex.level));
+
+   pipe_reference_init(&surface->base.reference, 1);
+   pipe_resource_reference(&surface->base.texture, texture);
+   surface->base.context = pipe;
+   surface->base.format = templ->format;
+   surface->base.width = width;
+   surface->base.height = height;
+   surface->base.u = templ->u;
+
+   surface->width0 = width0;
+   surface->height0 = height0;
+
+   surface->dcc_incompatible =
+      texture->target != PIPE_BUFFER &&
+      vi_dcc_formats_are_incompatible(texture, templ->u.tex.level, templ->format);
+   return &surface->base;
 }
 
-static struct pipe_surface *si_create_surface(struct pipe_context *pipe,
-                                             struct pipe_resource *tex,
-                                             const struct pipe_surface *templ)
+static struct pipe_surface *si_create_surface(struct pipe_context *pipe, struct pipe_resource *tex,
+                                              const struct pipe_surface *templ)
 {
-       unsigned level = templ->u.tex.level;
-       unsigned width = u_minify(tex->width0, level);
-       unsigned height = u_minify(tex->height0, level);
-       unsigned width0 = tex->width0;
-       unsigned height0 = tex->height0;
-
-       if (tex->target != PIPE_BUFFER && templ->format != tex->format) {
-               const struct util_format_description *tex_desc
-                       = util_format_description(tex->format);
-               const struct util_format_description *templ_desc
-                       = util_format_description(templ->format);
-
-               assert(tex_desc->block.bits == templ_desc->block.bits);
-
-               /* Adjust size of surface if and only if the block width or
-                * height is changed. */
-               if (tex_desc->block.width != templ_desc->block.width ||
-                   tex_desc->block.height != templ_desc->block.height) {
-                       unsigned nblks_x = util_format_get_nblocksx(tex->format, width);
-                       unsigned nblks_y = util_format_get_nblocksy(tex->format, height);
-
-                       width = nblks_x * templ_desc->block.width;
-                       height = nblks_y * templ_desc->block.height;
-
-                       width0 = util_format_get_nblocksx(tex->format, width0);
-                       height0 = util_format_get_nblocksy(tex->format, height0);
-               }
-       }
-
-       return si_create_surface_custom(pipe, tex, templ,
-                                         width0, height0,
-                                         width, height);
+   unsigned level = templ->u.tex.level;
+   unsigned width = u_minify(tex->width0, level);
+   unsigned height = u_minify(tex->height0, level);
+   unsigned width0 = tex->width0;
+   unsigned height0 = tex->height0;
+
+   if (tex->target != PIPE_BUFFER && templ->format != tex->format) {
+      const struct util_format_description *tex_desc = util_format_description(tex->format);
+      const struct util_format_description *templ_desc = util_format_description(templ->format);
+
+      assert(tex_desc->block.bits == templ_desc->block.bits);
+
+      /* Adjust size of surface if and only if the block width or
+       * height is changed. */
+      if (tex_desc->block.width != templ_desc->block.width ||
+          tex_desc->block.height != templ_desc->block.height) {
+         unsigned nblks_x = util_format_get_nblocksx(tex->format, width);
+         unsigned nblks_y = util_format_get_nblocksy(tex->format, height);
+
+         width = nblks_x * templ_desc->block.width;
+         height = nblks_y * templ_desc->block.height;
+
+         width0 = util_format_get_nblocksx(tex->format, width0);
+         height0 = util_format_get_nblocksy(tex->format, height0);
+      }
+   }
+
+   return si_create_surface_custom(pipe, tex, templ, width0, height0, width, height);
 }
 
-static void si_surface_destroy(struct pipe_context *pipe,
-                              struct pipe_surface *surface)
+static void si_surface_destroy(struct pipe_context *pipe, struct pipe_surface *surface)
 {
-       pipe_resource_reference(&surface->texture, NULL);
-       FREE(surface);
+   pipe_resource_reference(&surface->texture, NULL);
+   FREE(surface);
 }
 
 unsigned si_translate_colorswap(enum pipe_format format, bool do_endian_swap)
 {
-       const struct util_format_description *desc = util_format_description(format);
-
-#define HAS_SWIZZLE(chan,swz) (desc->swizzle[chan] == PIPE_SWIZZLE_##swz)
-
-       if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */
-               return V_028C70_SWAP_STD;
-
-       if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
-               return ~0U;
-
-       switch (desc->nr_channels) {
-       case 1:
-               if (HAS_SWIZZLE(0,X))
-                       return V_028C70_SWAP_STD; /* X___ */
-               else if (HAS_SWIZZLE(3,X))
-                       return V_028C70_SWAP_ALT_REV; /* ___X */
-               break;
-       case 2:
-               if ((HAS_SWIZZLE(0,X) && HAS_SWIZZLE(1,Y)) ||
-                   (HAS_SWIZZLE(0,X) && HAS_SWIZZLE(1,NONE)) ||
-                   (HAS_SWIZZLE(0,NONE) && HAS_SWIZZLE(1,Y)))
-                       return V_028C70_SWAP_STD; /* XY__ */
-               else if ((HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,X)) ||
-                        (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,NONE)) ||
-                        (HAS_SWIZZLE(0,NONE) && HAS_SWIZZLE(1,X)))
-                       /* YX__ */
-                       return (do_endian_swap ? V_028C70_SWAP_STD : V_028C70_SWAP_STD_REV);
-               else if (HAS_SWIZZLE(0,X) && HAS_SWIZZLE(3,Y))
-                       return V_028C70_SWAP_ALT; /* X__Y */
-               else if (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(3,X))
-                       return V_028C70_SWAP_ALT_REV; /* Y__X */
-               break;
-       case 3:
-               if (HAS_SWIZZLE(0,X))
-                       return (do_endian_swap ? V_028C70_SWAP_STD_REV : V_028C70_SWAP_STD);
-               else if (HAS_SWIZZLE(0,Z))
-                       return V_028C70_SWAP_STD_REV; /* ZYX */
-               break;
-       case 4:
-               /* check the middle channels, the 1st and 4th channel can be NONE */
-               if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,Z)) {
-                       return V_028C70_SWAP_STD; /* XYZW */
-               } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,Y)) {
-                       return V_028C70_SWAP_STD_REV; /* WZYX */
-               } else if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,X)) {
-                       return V_028C70_SWAP_ALT; /* ZYXW */
-               } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,W)) {
-                       /* YZWX */
-                       if (desc->is_array)
-                               return V_028C70_SWAP_ALT_REV;
-                       else
-                               return (do_endian_swap ? V_028C70_SWAP_ALT : V_028C70_SWAP_ALT_REV);
-               }
-               break;
-       }
-       return ~0U;
+   const struct util_format_description *desc = util_format_description(format);
+
+#define HAS_SWIZZLE(chan, swz) (desc->swizzle[chan] == PIPE_SWIZZLE_##swz)
+
+   if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */
+      return V_028C70_SWAP_STD;
+
+   if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
+      return ~0U;
+
+   switch (desc->nr_channels) {
+   case 1:
+      if (HAS_SWIZZLE(0, X))
+         return V_028C70_SWAP_STD; /* X___ */
+      else if (HAS_SWIZZLE(3, X))
+         return V_028C70_SWAP_ALT_REV; /* ___X */
+      break;
+   case 2:
+      if ((HAS_SWIZZLE(0, X) && HAS_SWIZZLE(1, Y)) || (HAS_SWIZZLE(0, X) && HAS_SWIZZLE(1, NONE)) ||
+          (HAS_SWIZZLE(0, NONE) && HAS_SWIZZLE(1, Y)))
+         return V_028C70_SWAP_STD; /* XY__ */
+      else if ((HAS_SWIZZLE(0, Y) && HAS_SWIZZLE(1, X)) ||
+               (HAS_SWIZZLE(0, Y) && HAS_SWIZZLE(1, NONE)) ||
+               (HAS_SWIZZLE(0, NONE) && HAS_SWIZZLE(1, X)))
+         /* YX__ */
+         return (do_endian_swap ? V_028C70_SWAP_STD : V_028C70_SWAP_STD_REV);
+      else if (HAS_SWIZZLE(0, X) && HAS_SWIZZLE(3, Y))
+         return V_028C70_SWAP_ALT; /* X__Y */
+      else if (HAS_SWIZZLE(0, Y) && HAS_SWIZZLE(3, X))
+         return V_028C70_SWAP_ALT_REV; /* Y__X */
+      break;
+   case 3:
+      if (HAS_SWIZZLE(0, X))
+         return (do_endian_swap ? V_028C70_SWAP_STD_REV : V_028C70_SWAP_STD);
+      else if (HAS_SWIZZLE(0, Z))
+         return V_028C70_SWAP_STD_REV; /* ZYX */
+      break;
+   case 4:
+      /* Check the middle channels; the 1st and 4th channels can be NONE. */
+      if (HAS_SWIZZLE(1, Y) && HAS_SWIZZLE(2, Z)) {
+         return V_028C70_SWAP_STD; /* XYZW */
+      } else if (HAS_SWIZZLE(1, Z) && HAS_SWIZZLE(2, Y)) {
+         return V_028C70_SWAP_STD_REV; /* WZYX */
+      } else if (HAS_SWIZZLE(1, Y) && HAS_SWIZZLE(2, X)) {
+         return V_028C70_SWAP_ALT; /* ZYXW */
+      } else if (HAS_SWIZZLE(1, Z) && HAS_SWIZZLE(2, W)) {
+         /* YZWX */
+         if (desc->is_array)
+            return V_028C70_SWAP_ALT_REV;
+         else
+            return (do_endian_swap ? V_028C70_SWAP_ALT : V_028C70_SWAP_ALT_REV);
+      }
+      break;
+   }
+   return ~0U;
 }
 
 /* PIPELINE_STAT-BASED DCC ENABLEMENT FOR DISPLAYABLE SURFACES */
 
-static void vi_dcc_clean_up_context_slot(struct si_context *sctx,
-                                        int slot)
+static void vi_dcc_clean_up_context_slot(struct si_context *sctx, int slot)
 {
-       int i;
+   int i;
 
-       if (sctx->dcc_stats[slot].query_active)
-               vi_separate_dcc_stop_query(sctx,
-                                          sctx->dcc_stats[slot].tex);
+   if (sctx->dcc_stats[slot].query_active)
+      vi_separate_dcc_stop_query(sctx, sctx->dcc_stats[slot].tex);
 
-       for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats[slot].ps_stats); i++)
-               if (sctx->dcc_stats[slot].ps_stats[i]) {
-                       sctx->b.destroy_query(&sctx->b,
-                                             sctx->dcc_stats[slot].ps_stats[i]);
-                       sctx->dcc_stats[slot].ps_stats[i] = NULL;
-               }
+   for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats[slot].ps_stats); i++)
+      if (sctx->dcc_stats[slot].ps_stats[i]) {
+         sctx->b.destroy_query(&sctx->b, sctx->dcc_stats[slot].ps_stats[i]);
+         sctx->dcc_stats[slot].ps_stats[i] = NULL;
+      }
 
-       si_texture_reference(&sctx->dcc_stats[slot].tex, NULL);
+   si_texture_reference(&sctx->dcc_stats[slot].tex, NULL);
 }
 
 /**
  * Return the per-context slot where DCC statistics queries for the texture live.
  */
-static unsigned vi_get_context_dcc_stats_index(struct si_context *sctx,
-                                              struct si_texture *tex)
+static unsigned vi_get_context_dcc_stats_index(struct si_context *sctx, struct si_texture *tex)
 {
-       int i, empty_slot = -1;
-
-       /* Remove zombie textures (textures kept alive by this array only). */
-       for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++)
-               if (sctx->dcc_stats[i].tex &&
-                   sctx->dcc_stats[i].tex->buffer.b.b.reference.count == 1)
-                       vi_dcc_clean_up_context_slot(sctx, i);
-
-       /* Find the texture. */
-       for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) {
-               /* Return if found. */
-               if (sctx->dcc_stats[i].tex == tex) {
-                       sctx->dcc_stats[i].last_use_timestamp = os_time_get();
-                       return i;
-               }
-
-               /* Record the first seen empty slot. */
-               if (empty_slot == -1 && !sctx->dcc_stats[i].tex)
-                       empty_slot = i;
-       }
-
-       /* Not found. Remove the oldest member to make space in the array. */
-       if (empty_slot == -1) {
-               int oldest_slot = 0;
-
-               /* Find the oldest slot. */
-               for (i = 1; i < ARRAY_SIZE(sctx->dcc_stats); i++)
-                       if (sctx->dcc_stats[oldest_slot].last_use_timestamp >
-                           sctx->dcc_stats[i].last_use_timestamp)
-                               oldest_slot = i;
-
-               /* Clean up the oldest slot. */
-               vi_dcc_clean_up_context_slot(sctx, oldest_slot);
-               empty_slot = oldest_slot;
-       }
-
-       /* Add the texture to the new slot. */
-       si_texture_reference(&sctx->dcc_stats[empty_slot].tex, tex);
-       sctx->dcc_stats[empty_slot].last_use_timestamp = os_time_get();
-       return empty_slot;
+   int i, empty_slot = -1;
+
+   /* Remove zombie textures (textures kept alive by this array only). */
+   for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++)
+      if (sctx->dcc_stats[i].tex && sctx->dcc_stats[i].tex->buffer.b.b.reference.count == 1)
+         vi_dcc_clean_up_context_slot(sctx, i);
+
+   /* Find the texture. */
+   for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) {
+      /* Return if found. */
+      if (sctx->dcc_stats[i].tex == tex) {
+         sctx->dcc_stats[i].last_use_timestamp = os_time_get();
+         return i;
+      }
+
+      /* Record the first seen empty slot. */
+      if (empty_slot == -1 && !sctx->dcc_stats[i].tex)
+         empty_slot = i;
+   }
+
+   /* Not found. Remove the oldest member to make space in the array. */
+   if (empty_slot == -1) {
+      int oldest_slot = 0;
+
+      /* Find the oldest slot. */
+      for (i = 1; i < ARRAY_SIZE(sctx->dcc_stats); i++)
+         if (sctx->dcc_stats[oldest_slot].last_use_timestamp >
+             sctx->dcc_stats[i].last_use_timestamp)
+            oldest_slot = i;
+
+      /* Clean up the oldest slot. */
+      vi_dcc_clean_up_context_slot(sctx, oldest_slot);
+      empty_slot = oldest_slot;
+   }
+
+   /* Add the texture to the new slot. */
+   si_texture_reference(&sctx->dcc_stats[empty_slot].tex, tex);
+   sctx->dcc_stats[empty_slot].last_use_timestamp = os_time_get();
+   return empty_slot;
 }
 
-static struct pipe_query *
-vi_create_resuming_pipestats_query(struct si_context *sctx)
+static struct pipe_query *vi_create_resuming_pipestats_query(struct si_context *sctx)
 {
-       struct si_query_hw *query = (struct si_query_hw*)
-               sctx->b.create_query(&sctx->b, PIPE_QUERY_PIPELINE_STATISTICS, 0);
+   struct si_query_hw *query =
+      (struct si_query_hw *)sctx->b.create_query(&sctx->b, PIPE_QUERY_PIPELINE_STATISTICS, 0);
 
-       query->flags |= SI_QUERY_HW_FLAG_BEGIN_RESUMES;
-       return (struct pipe_query*)query;
+   query->flags |= SI_QUERY_HW_FLAG_BEGIN_RESUMES;
+   return (struct pipe_query *)query;
 }
 
 /**
  * Called when binding a color buffer.
  */
-void vi_separate_dcc_start_query(struct si_context *sctx,
-                                struct si_texture *tex)
+void vi_separate_dcc_start_query(struct si_context *sctx, struct si_texture *tex)
 {
-       unsigned i = vi_get_context_dcc_stats_index(sctx, tex);
+   unsigned i = vi_get_context_dcc_stats_index(sctx, tex);
 
-       assert(!sctx->dcc_stats[i].query_active);
+   assert(!sctx->dcc_stats[i].query_active);
 
-       if (!sctx->dcc_stats[i].ps_stats[0])
-               sctx->dcc_stats[i].ps_stats[0] = vi_create_resuming_pipestats_query(sctx);
+   if (!sctx->dcc_stats[i].ps_stats[0])
+      sctx->dcc_stats[i].ps_stats[0] = vi_create_resuming_pipestats_query(sctx);
 
-       /* begin or resume the query */
-       sctx->b.begin_query(&sctx->b, sctx->dcc_stats[i].ps_stats[0]);
-       sctx->dcc_stats[i].query_active = true;
+   /* begin or resume the query */
+   sctx->b.begin_query(&sctx->b, sctx->dcc_stats[i].ps_stats[0]);
+   sctx->dcc_stats[i].query_active = true;
 }
 
 /**
  * Called when unbinding a color buffer.
  */
-void vi_separate_dcc_stop_query(struct si_context *sctx,
-                               struct si_texture *tex)
+void vi_separate_dcc_stop_query(struct si_context *sctx, struct si_texture *tex)
 {
-       unsigned i = vi_get_context_dcc_stats_index(sctx, tex);
+   unsigned i = vi_get_context_dcc_stats_index(sctx, tex);
 
-       assert(sctx->dcc_stats[i].query_active);
-       assert(sctx->dcc_stats[i].ps_stats[0]);
+   assert(sctx->dcc_stats[i].query_active);
+   assert(sctx->dcc_stats[i].ps_stats[0]);
 
-       /* pause or end the query */
-       sctx->b.end_query(&sctx->b, sctx->dcc_stats[i].ps_stats[0]);
-       sctx->dcc_stats[i].query_active = false;
+   /* pause or end the query */
+   sctx->b.end_query(&sctx->b, sctx->dcc_stats[i].ps_stats[0]);
+   sctx->dcc_stats[i].query_active = false;
 }
 
 static bool vi_should_enable_separate_dcc(struct si_texture *tex)
 {
-       /* The minimum number of fullscreen draws per frame that is required
-        * to enable DCC. */
-       return tex->ps_draw_ratio + tex->num_slow_clears >= 5;
+   /* The minimum number of fullscreen draws per frame that is required
+    * to enable DCC. */
+   return tex->ps_draw_ratio + tex->num_slow_clears >= 5;
 }
 
 /* Called by fast clear. */
-void vi_separate_dcc_try_enable(struct si_context *sctx,
-                               struct si_texture *tex)
+void vi_separate_dcc_try_enable(struct si_context *sctx, struct si_texture *tex)
 {
-       /* The intent is to use this with shared displayable back buffers,
-        * but it's not strictly limited only to them.
-        */
-       if (!tex->buffer.b.is_shared ||
-           !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) ||
-           tex->buffer.b.b.target != PIPE_TEXTURE_2D ||
-           tex->buffer.b.b.last_level > 0 ||
-           !tex->surface.dcc_size ||
-           sctx->screen->debug_flags & DBG(NO_DCC) ||
-           sctx->screen->debug_flags & DBG(NO_DCC_FB))
-               return;
-
-       assert(sctx->chip_class >= GFX8);
-
-       if (tex->surface.dcc_offset)
-               return; /* already enabled */
-
-       /* Enable the DCC stat gathering. */
-       if (!tex->dcc_gather_statistics) {
-               tex->dcc_gather_statistics = true;
-               vi_separate_dcc_start_query(sctx, tex);
-       }
-
-       if (!vi_should_enable_separate_dcc(tex))
-               return; /* stats show that DCC decompression is too expensive */
-
-       assert(tex->surface.num_dcc_levels);
-       assert(!tex->dcc_separate_buffer);
-
-       si_texture_discard_cmask(sctx->screen, tex);
-
-       /* Get a DCC buffer. */
-       if (tex->last_dcc_separate_buffer) {
-               assert(tex->dcc_gather_statistics);
-               assert(!tex->dcc_separate_buffer);
-               tex->dcc_separate_buffer = tex->last_dcc_separate_buffer;
-               tex->last_dcc_separate_buffer = NULL;
-       } else {
-               tex->dcc_separate_buffer =
-                       si_aligned_buffer_create(sctx->b.screen,
-                                                  SI_RESOURCE_FLAG_UNMAPPABLE,
-                                                  PIPE_USAGE_DEFAULT,
-                                                  tex->surface.dcc_size,
-                                                  tex->surface.dcc_alignment);
-               if (!tex->dcc_separate_buffer)
-                       return;
-       }
-
-       /* dcc_offset is the absolute GPUVM address. */
-       tex->surface.dcc_offset = tex->dcc_separate_buffer->gpu_address;
-
-       /* no need to flag anything since this is called by fast clear that
-        * flags framebuffer state
-        */
+   /* The intent is to use this with shared displayable back buffers,
+    * but it's not strictly limited only to them.
+    */
+   if (!tex->buffer.b.is_shared ||
+       !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) ||
+       tex->buffer.b.b.target != PIPE_TEXTURE_2D || tex->buffer.b.b.last_level > 0 ||
+       !tex->surface.dcc_size || sctx->screen->debug_flags & DBG(NO_DCC) ||
+       sctx->screen->debug_flags & DBG(NO_DCC_FB))
+      return;
+
+   assert(sctx->chip_class >= GFX8);
+
+   if (tex->surface.dcc_offset)
+      return; /* already enabled */
+
+   /* Enable the DCC stat gathering. */
+   if (!tex->dcc_gather_statistics) {
+      tex->dcc_gather_statistics = true;
+      vi_separate_dcc_start_query(sctx, tex);
+   }
+
+   if (!vi_should_enable_separate_dcc(tex))
+      return; /* stats show that DCC decompression is too expensive */
+
+   assert(tex->surface.num_dcc_levels);
+   assert(!tex->dcc_separate_buffer);
+
+   si_texture_discard_cmask(sctx->screen, tex);
+
+   /* Get a DCC buffer. */
+   if (tex->last_dcc_separate_buffer) {
+      assert(tex->dcc_gather_statistics);
+      assert(!tex->dcc_separate_buffer);
+      tex->dcc_separate_buffer = tex->last_dcc_separate_buffer;
+      tex->last_dcc_separate_buffer = NULL;
+   } else {
+      tex->dcc_separate_buffer =
+         si_aligned_buffer_create(sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT,
+                                  tex->surface.dcc_size, tex->surface.dcc_alignment);
+      if (!tex->dcc_separate_buffer)
+         return;
+   }
+
+   /* dcc_offset is the absolute GPUVM address. */
+   tex->surface.dcc_offset = tex->dcc_separate_buffer->gpu_address;
+
+   /* no need to flag anything since this is called by fast clear, which
+    * flags framebuffer state
+    */
 }
 
 /**
  * Called by pipe_context::flush_resource, the place where DCC decompression
  * takes place.
  */
-void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx,
-                                            struct si_texture *tex)
+void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx, struct si_texture *tex)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
-       struct pipe_query *tmp;
-       unsigned i = vi_get_context_dcc_stats_index(sctx, tex);
-       bool query_active = sctx->dcc_stats[i].query_active;
-       bool disable = false;
-
-       if (sctx->dcc_stats[i].ps_stats[2]) {
-               union pipe_query_result result;
-
-               /* Read the results. */
-               struct pipe_query *query = sctx->dcc_stats[i].ps_stats[2];
-               ctx->get_query_result(ctx, query,
-                                     true, &result);
-               si_query_buffer_reset(sctx, &((struct si_query_hw*)query)->buffer);
-
-               /* Compute the approximate number of fullscreen draws. */
-               tex->ps_draw_ratio =
-                       result.pipeline_statistics.ps_invocations /
-                       (tex->buffer.b.b.width0 * tex->buffer.b.b.height0);
-               sctx->last_tex_ps_draw_ratio = tex->ps_draw_ratio;
-
-               disable = tex->dcc_separate_buffer &&
-                         !vi_should_enable_separate_dcc(tex);
-       }
-
-       tex->num_slow_clears = 0;
-
-       /* stop the statistics query for ps_stats[0] */
-       if (query_active)
-               vi_separate_dcc_stop_query(sctx, tex);
-
-       /* Move the queries in the queue by one. */
-       tmp = sctx->dcc_stats[i].ps_stats[2];
-       sctx->dcc_stats[i].ps_stats[2] = sctx->dcc_stats[i].ps_stats[1];
-       sctx->dcc_stats[i].ps_stats[1] = sctx->dcc_stats[i].ps_stats[0];
-       sctx->dcc_stats[i].ps_stats[0] = tmp;
-
-       /* create and start a new query as ps_stats[0] */
-       if (query_active)
-               vi_separate_dcc_start_query(sctx, tex);
-
-       if (disable) {
-               assert(!tex->last_dcc_separate_buffer);
-               tex->last_dcc_separate_buffer = tex->dcc_separate_buffer;
-               tex->dcc_separate_buffer = NULL;
-               tex->surface.dcc_offset = 0;
-               /* no need to flag anything since this is called after
-                * decompression that re-sets framebuffer state
-                */
-       }
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct pipe_query *tmp;
+   unsigned i = vi_get_context_dcc_stats_index(sctx, tex);
+   bool query_active = sctx->dcc_stats[i].query_active;
+   bool disable = false;
+
+   if (sctx->dcc_stats[i].ps_stats[2]) {
+      union pipe_query_result result;
+
+      /* Read the results. */
+      struct pipe_query *query = sctx->dcc_stats[i].ps_stats[2];
+      ctx->get_query_result(ctx, query, true, &result);
+      si_query_buffer_reset(sctx, &((struct si_query_hw *)query)->buffer);
+
+      /* Compute the approximate number of fullscreen draws. */
+      tex->ps_draw_ratio = result.pipeline_statistics.ps_invocations /
+                           (tex->buffer.b.b.width0 * tex->buffer.b.b.height0);
+      sctx->last_tex_ps_draw_ratio = tex->ps_draw_ratio;
+
+      disable = tex->dcc_separate_buffer && !vi_should_enable_separate_dcc(tex);
+   }
+
+   tex->num_slow_clears = 0;
+
+   /* stop the statistics query for ps_stats[0] */
+   if (query_active)
+      vi_separate_dcc_stop_query(sctx, tex);
+
+   /* Move the queries in the queue by one. */
+   tmp = sctx->dcc_stats[i].ps_stats[2];
+   sctx->dcc_stats[i].ps_stats[2] = sctx->dcc_stats[i].ps_stats[1];
+   sctx->dcc_stats[i].ps_stats[1] = sctx->dcc_stats[i].ps_stats[0];
+   sctx->dcc_stats[i].ps_stats[0] = tmp;
+
+   /* create and start a new query as ps_stats[0] */
+   if (query_active)
+      vi_separate_dcc_start_query(sctx, tex);
+
+   if (disable) {
+      assert(!tex->last_dcc_separate_buffer);
+      tex->last_dcc_separate_buffer = tex->dcc_separate_buffer;
+      tex->dcc_separate_buffer = NULL;
+      tex->surface.dcc_offset = 0;
+      /* no need to flag anything since this is called after
+       * decompression that re-sets framebuffer state
+       */
+   }
 }
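
For readers unfamiliar with the separate-DCC heuristic, the rotation above keeps a three-deep pipeline of pixel-shader-invocation queries per texture and only reads back the oldest slot, so the result is always a couple of frames behind and never stalls the CPU. Below is a minimal standalone sketch of the same ring rotation and draw-ratio arithmetic; all names and numbers are illustrative only, not part of the driver:

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for a GPU query object; the real driver uses
 * struct pipe_query handles owned by the context. */
struct fake_query {
   uint64_t ps_invocations; /* result pretended to be read back from the GPU */
};

/* Rotate a 3-deep ring: slot 2 holds the oldest (readable) query,
 * slot 0 the one currently being recorded. */
static void rotate_ring(struct fake_query *ring[3])
{
   struct fake_query *oldest = ring[2];
   ring[2] = ring[1];
   ring[1] = ring[0];
   ring[0] = oldest; /* reused for the next frame's query */
}

int main(void)
{
   struct fake_query q0 = {0}, q1 = {0}, q2 = {.ps_invocations = 8294400};
   struct fake_query *ring[3] = {&q0, &q1, &q2};

   unsigned width = 1920, height = 1080;

   /* Same arithmetic as the driver: invocations divided by the screen area
    * approximates how many full-screen draws touched the surface. */
   unsigned ps_draw_ratio = (unsigned)(ring[2]->ps_invocations / (width * height));
   printf("approx. fullscreen draws: %u\n", ps_draw_ratio); /* prints 4 */

   rotate_ring(ring);
   return 0;
}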
 
 static struct pipe_memory_object *
-si_memobj_from_handle(struct pipe_screen *screen,
-                     struct winsys_handle *whandle,
-                     bool dedicated)
+si_memobj_from_handle(struct pipe_screen *screen, struct winsys_handle *whandle, bool dedicated)
 {
-       struct si_screen *sscreen = (struct si_screen*)screen;
-       struct si_memory_object *memobj = CALLOC_STRUCT(si_memory_object);
-       struct pb_buffer *buf = NULL;
-
-       if (!memobj)
-               return NULL;
+   struct si_screen *sscreen = (struct si_screen *)screen;
+   struct si_memory_object *memobj = CALLOC_STRUCT(si_memory_object);
+   struct pb_buffer *buf = NULL;
 
-       buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle,
-                                             sscreen->info.max_alignment);
-       if (!buf) {
-               free(memobj);
-               return NULL;
-       }
+   if (!memobj)
+      return NULL;
 
-       memobj->b.dedicated = dedicated;
-       memobj->buf = buf;
-       memobj->stride = whandle->stride;
+   buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle, sscreen->info.max_alignment);
+   if (!buf) {
+      free(memobj);
+      return NULL;
+   }
 
-       return (struct pipe_memory_object *)memobj;
+   memobj->b.dedicated = dedicated;
+   memobj->buf = buf;
+   memobj->stride = whandle->stride;
 
+   return (struct pipe_memory_object *)memobj;
 }
 
-static void
-si_memobj_destroy(struct pipe_screen *screen,
-                 struct pipe_memory_object *_memobj)
+static void si_memobj_destroy(struct pipe_screen *screen, struct pipe_memory_object *_memobj)
 {
-       struct si_memory_object *memobj = (struct si_memory_object *)_memobj;
+   struct si_memory_object *memobj = (struct si_memory_object *)_memobj;
 
-       pb_reference(&memobj->buf, NULL);
-       free(memobj);
+   pb_reference(&memobj->buf, NULL);
+   free(memobj);
 }
 
-static struct pipe_resource *
-si_texture_from_memobj(struct pipe_screen *screen,
-                      const struct pipe_resource *templ,
-                      struct pipe_memory_object *_memobj,
-                      uint64_t offset)
+static struct pipe_resource *si_texture_from_memobj(struct pipe_screen *screen,
+                                                    const struct pipe_resource *templ,
+                                                    struct pipe_memory_object *_memobj,
+                                                    uint64_t offset)
 {
-       struct si_screen *sscreen = (struct si_screen*)screen;
-       struct si_memory_object *memobj = (struct si_memory_object *)_memobj;
-       struct pipe_resource *tex =
-               si_texture_from_winsys_buffer(sscreen, templ, memobj->buf,
-                                             memobj->stride, offset,
-                                             PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE |
-                                             PIPE_HANDLE_USAGE_SHADER_WRITE,
-                                             memobj->b.dedicated);
-       if (!tex)
-               return NULL;
-
-       /* si_texture_from_winsys_buffer doesn't increment refcount of
-        * memobj->buf, so increment it here.
-        */
-       struct pb_buffer *buf = NULL;
-       pb_reference(&buf, memobj->buf);
-       return tex;
+   struct si_screen *sscreen = (struct si_screen *)screen;
+   struct si_memory_object *memobj = (struct si_memory_object *)_memobj;
+   struct pipe_resource *tex = si_texture_from_winsys_buffer(
+      sscreen, templ, memobj->buf, memobj->stride, offset,
+      PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE | PIPE_HANDLE_USAGE_SHADER_WRITE, memobj->b.dedicated);
+   if (!tex)
+      return NULL;
+
+   /* si_texture_from_winsys_buffer doesn't increment refcount of
+    * memobj->buf, so increment it here.
+    */
+   struct pb_buffer *buf = NULL;
+   pb_reference(&buf, memobj->buf);
+   return tex;
 }
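
The extra pb_reference(&buf, memobj->buf) at the end of si_texture_from_memobj follows the usual Gallium convention for taking a reference on a buffer the callee did not reference itself. The following is a reduced sketch of that idiom using a hypothetical refcounted object, not the real pb_buffer API:

#include <assert.h>
#include <stdlib.h>

/* Hypothetical refcounted buffer; the real driver uses struct pb_buffer
 * with pipe_reference-based counting. */
struct buf {
   int refcount;
};

/* Mirrors the pb_reference() convention: drop the old *dst reference (if any),
 * take a reference on src, and store it in *dst. Passing src == NULL releases. */
static void buf_reference(struct buf **dst, struct buf *src)
{
   if (*dst && --(*dst)->refcount == 0)
      free(*dst);
   if (src)
      src->refcount++;
   *dst = src;
}

int main(void)
{
   struct buf *imported = calloc(1, sizeof(*imported));
   imported->refcount = 1; /* owned by the memory object */

   /* A texture wrapper that did not add its own reference: take one here,
    * like the extra pb_reference() after si_texture_from_winsys_buffer. */
   struct buf *tex_ref = NULL;
   buf_reference(&tex_ref, imported);
   assert(imported->refcount == 2);

   /* Destroying the texture and the memory object each drop one reference. */
   buf_reference(&tex_ref, NULL);
   struct buf *memobj_ref = imported;
   buf_reference(&memobj_ref, NULL); /* last reference: buffer is freed */
   return 0;
}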
 
-static bool si_check_resource_capability(struct pipe_screen *screen,
-                                        struct pipe_resource *resource,
-                                        unsigned bind)
+static bool si_check_resource_capability(struct pipe_screen *screen,
+                                         struct pipe_resource *resource, unsigned bind)
 {
-       struct si_texture *tex = (struct si_texture*)resource;
+   struct si_texture *tex = (struct si_texture *)resource;
 
-       /* Buffers only support the linear flag. */
-       if (resource->target == PIPE_BUFFER)
-               return (bind & ~PIPE_BIND_LINEAR) == 0;
+   /* Buffers only support the linear flag. */
+   if (resource->target == PIPE_BUFFER)
+      return (bind & ~PIPE_BIND_LINEAR) == 0;
 
-       if (bind & PIPE_BIND_LINEAR && !tex->surface.is_linear)
-               return false;
+   if (bind & PIPE_BIND_LINEAR && !tex->surface.is_linear)
+      return false;
 
-       if (bind & PIPE_BIND_SCANOUT && !tex->surface.is_displayable)
-               return false;
+   if (bind & PIPE_BIND_SCANOUT && !tex->surface.is_displayable)
+      return false;
 
-       /* TODO: PIPE_BIND_CURSOR - do we care? */
-       return true;
+   /* TODO: PIPE_BIND_CURSOR - do we care? */
+   return true;
 }
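
The buffer path above relies on a mask trick: (bind & ~PIPE_BIND_LINEAR) == 0 accepts a request only when every set bit is PIPE_BIND_LINEAR. A standalone illustration with made-up flag values (the real PIPE_BIND_* constants come from the Gallium headers):

#include <stdbool.h>
#include <stdio.h>

/* Made-up flag values for illustration only. */
#define BIND_LINEAR  (1u << 0)
#define BIND_SCANOUT (1u << 1)

/* True only if every requested bit is BIND_LINEAR (or no bits at all),
 * mirroring the buffer path of si_check_resource_capability. */
static bool buffer_supports(unsigned bind)
{
   return (bind & ~BIND_LINEAR) == 0;
}

int main(void)
{
   printf("%d\n", buffer_supports(0));                         /* 1: nothing requested */
   printf("%d\n", buffer_supports(BIND_LINEAR));               /* 1: linear only */
   printf("%d\n", buffer_supports(BIND_LINEAR | BIND_SCANOUT));/* 0: scanout not supported */
   return 0;
}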
 
 void si_init_screen_texture_functions(struct si_screen *sscreen)
 {
-       sscreen->b.resource_from_handle = si_texture_from_handle;
-       sscreen->b.resource_get_handle = si_texture_get_handle;
-       sscreen->b.resource_get_param = si_resource_get_param;
-       sscreen->b.resource_get_info = si_texture_get_info;
-       sscreen->b.resource_from_memobj = si_texture_from_memobj;
-       sscreen->b.memobj_create_from_handle = si_memobj_from_handle;
-       sscreen->b.memobj_destroy = si_memobj_destroy;
-       sscreen->b.check_resource_capability = si_check_resource_capability;
+   sscreen->b.resource_from_handle = si_texture_from_handle;
+   sscreen->b.resource_get_handle = si_texture_get_handle;
+   sscreen->b.resource_get_param = si_resource_get_param;
+   sscreen->b.resource_get_info = si_texture_get_info;
+   sscreen->b.resource_from_memobj = si_texture_from_memobj;
+   sscreen->b.memobj_create_from_handle = si_memobj_from_handle;
+   sscreen->b.memobj_destroy = si_memobj_destroy;
+   sscreen->b.check_resource_capability = si_check_resource_capability;
 }
 
 void si_init_context_texture_functions(struct si_context *sctx)
 {
-       sctx->b.create_surface = si_create_surface;
-       sctx->b.surface_destroy = si_surface_destroy;
+   sctx->b.create_surface = si_create_surface;
+   sctx->b.surface_destroy = si_surface_destroy;
 }
index 5511c2d7ad2a6a361840f31baba92b680a0934f4..0f38cce0f9603bcc0dd8ccc32554d8e7c5358b0e 100644
  *
  **************************************************************************/
 
-#include "si_pipe.h"
-#include "radeon/radeon_video.h"
 #include "radeon/radeon_uvd.h"
+#include "radeon/radeon_uvd_enc.h"
 #include "radeon/radeon_vce.h"
 #include "radeon/radeon_vcn_dec.h"
 #include "radeon/radeon_vcn_enc.h"
-#include "radeon/radeon_uvd_enc.h"
+#include "radeon/radeon_video.h"
+#include "si_pipe.h"
 #include "util/u_video.h"
 
 /**
  * creates a video buffer with a UVD compatible memory layout
  */
 struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe,
-                                                const struct pipe_video_buffer *tmpl)
+                                                 const struct pipe_video_buffer *tmpl)
 {
-       struct pipe_video_buffer vidbuf = *tmpl;
-       /* TODO: get tiling working */
-       vidbuf.bind |= PIPE_BIND_LINEAR;
+   struct pipe_video_buffer vidbuf = *tmpl;
+   /* TODO: get tiling working */
+   vidbuf.bind |= PIPE_BIND_LINEAR;
 
-       return vl_video_buffer_create_as_resource(pipe, &vidbuf);
+   return vl_video_buffer_create_as_resource(pipe, &vidbuf);
 }
 
 /* set the decoding target buffer offsets */
-static struct pb_buffer* si_uvd_set_dtb(struct ruvd_msg *msg, struct vl_video_buffer *buf)
+static struct pb_buffer *si_uvd_set_dtb(struct ruvd_msg *msg, struct vl_video_buffer *buf)
 {
-       struct si_screen *sscreen = (struct si_screen*)buf->base.context->screen;
-       struct si_texture *luma = (struct si_texture *)buf->resources[0];
-       struct si_texture *chroma = (struct si_texture *)buf->resources[1];
-       enum ruvd_surface_type type =  (sscreen->info.chip_class >= GFX9) ?
-                                       RUVD_SURFACE_TYPE_GFX9 :
-                                       RUVD_SURFACE_TYPE_LEGACY;
+   struct si_screen *sscreen = (struct si_screen *)buf->base.context->screen;
+   struct si_texture *luma = (struct si_texture *)buf->resources[0];
+   struct si_texture *chroma = (struct si_texture *)buf->resources[1];
+   enum ruvd_surface_type type =
+      (sscreen->info.chip_class >= GFX9) ? RUVD_SURFACE_TYPE_GFX9 : RUVD_SURFACE_TYPE_LEGACY;
 
-       msg->body.decode.dt_field_mode = buf->base.interlaced;
+   msg->body.decode.dt_field_mode = buf->base.interlaced;
 
-       si_uvd_set_dt_surfaces(msg, &luma->surface, (chroma) ? &chroma->surface : NULL, type);
+   si_uvd_set_dt_surfaces(msg, &luma->surface, (chroma) ? &chroma->surface : NULL, type);
 
-       return luma->buffer.buf;
+   return luma->buffer.buf;
 }
 
 /* get the radeon resources for VCE */
-static void si_vce_get_buffer(struct pipe_resource *resource,
-                             struct pb_buffer **handle,
-                             struct radeon_surf **surface)
+static void si_vce_get_buffer(struct pipe_resource *resource, struct pb_buffer **handle,
+                              struct radeon_surf **surface)
 {
-       struct si_texture *res = (struct si_texture *)resource;
+   struct si_texture *res = (struct si_texture *)resource;
 
-       if (handle)
-               *handle = res->buffer.buf;
+   if (handle)
+      *handle = res->buffer.buf;
 
-       if (surface)
-               *surface = &res->surface;
+   if (surface)
+      *surface = &res->surface;
 }
 
 /**
  * creates a UVD compatible decoder
  */
 struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context,
-                                              const struct pipe_video_codec *templ)
+                                               const struct pipe_video_codec *templ)
 {
-       struct si_context *ctx = (struct si_context *)context;
-       bool vcn = ctx->family >= CHIP_RAVEN;
+   struct si_context *ctx = (struct si_context *)context;
+   bool vcn = ctx->family >= CHIP_RAVEN;
 
-       if (templ->entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) {
-               if (vcn) {
-                       return radeon_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
-               } else {
-                       if (u_reduce_video_profile(templ->profile) == PIPE_VIDEO_FORMAT_HEVC)
-                               return radeon_uvd_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
-                       else
-                               return si_vce_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
-               }
-       }
+   if (templ->entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) {
+      if (vcn) {
+         return radeon_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
+      } else {
+         if (u_reduce_video_profile(templ->profile) == PIPE_VIDEO_FORMAT_HEVC)
+            return radeon_uvd_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
+         else
+            return si_vce_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
+      }
+   }
 
-       return (vcn) ?  radeon_create_decoder(context, templ) :
-               si_common_uvd_create_decoder(context, templ, si_uvd_set_dtb);
+   return (vcn) ? radeon_create_decoder(context, templ)
+                : si_common_uvd_create_decoder(context, templ, si_uvd_set_dtb);
 }
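
To summarize the selection above: encoding uses the VCN encoder on Raven and newer, otherwise the UVD encoder for HEVC and the VCE encoder for everything else, while decoding uses the VCN decoder on Raven and newer and the common UVD decoder otherwise. A compact standalone restatement of that decision, with illustrative enum names rather than the driver's actual codec objects:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative labels; the driver returns real pipe_video_codec objects. */
enum codec_backend { VCN_ENC, UVD_ENC, VCE_ENC, VCN_DEC, UVD_DEC };

static enum codec_backend pick_backend(bool has_vcn, bool is_encode, bool is_hevc)
{
   if (is_encode) {
      if (has_vcn)
         return VCN_ENC;
      return is_hevc ? UVD_ENC : VCE_ENC;
   }
   return has_vcn ? VCN_DEC : UVD_DEC;
}

int main(void)
{
   /* Pre-Raven HEVC encode lands on the UVD encoder. */
   printf("%d\n", pick_backend(false, true, true) == UVD_ENC);  /* 1 */
   /* Raven and newer decode always uses VCN. */
   printf("%d\n", pick_backend(true, false, false) == VCN_DEC); /* 1 */
   return 0;
}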